# parse_out_email_text.py
  1. #!/usr/bin/python
  2. from nltk.stem.snowball import SnowballStemmer
  3. import string
  4. def parseOutText(f):
  5. """ given an opened email file f, parse out all text below the
  6. metadata block at the top
  7. (in Part 2, you will also add stemming capabilities)
  8. and return a string that contains all the words
  9. in the email (space-separated)
  10. example use case:
  11. f = open("email_file_name.txt", "r")
  12. text = parseOutText(f)
  13. """
  14. f.seek(0) ### go back to beginning of file (annoying)
  15. all_text = f.read()
  16. ### split off metadata
  17. content = all_text.split("X-FileName:")
  18. words = ""
  19. if len(content) > 1:
  20. ### remove punctuation
  21. text_string = content[1].translate(str.maketrans("", "", string.punctuation))
  22. ### project part 2: comment out the line below
  23. #words = text_string
  24. ### split the text string into individual words, stem each word,
  25. ### and append the stemmed word to words (make sure there's a single
  26. ### space between each stemmed word)
  27. str_list = text_string.split(" ")
  28. #print(str_list)
  29. #不知道为什么会有空的元素夹杂在里边,所以写个循环删掉空的元素
  30. for txt in str_list:
  31. if txt == "":
  32. str_list.remove("")
  33. stemmer = SnowballStemmer("english")
  34. for x in range(0,len(str_list)):
  35. str_list[x] = stemmer.stem(str_list[x])
  36. # 将一个列表转换为字符串并且用空格隔开
  37. words = " ".join(str_list)
  38. #words = str_list
  39. #print("The type is", type(str_list))
  40. return words
  41. def main():
  42. ff = open("../text_learning/test_email.txt", "r")
  43. text = parseOutText(ff)
  44. print(text)
  45. if __name__ == '__main__':
  46. main()