编程语言
首页 > 编程语言 > python 练习0006

python 练习0006

作者:互联网

问题

你有一个目录,放了你一个月的日记,都是 txt,为了避免分词的问题,假设内容都是英文,请统计出你认为每篇日记最重要的词。

代码

from collections import Counter
import re, os

def get_txt(file_dir):
    """Return the absolute paths of the ``.txt`` files directly inside *file_dir*.

    The directory is listed in place: the process working directory is not
    changed, so relative paths used elsewhere keep resolving as before.

    Args:
        file_dir: Path of the directory to scan (not searched recursively).

    Returns:
        A sorted list of absolute paths, one per directory entry whose name
        ends with ``.txt``, or ``None`` (after printing an error message)
        when the directory cannot be listed.
    """
    try:
        # os.fspath rejects non-path values (e.g. None) up front so they are
        # reported the same way as a missing directory.
        dir_path = os.fspath(file_dir)
        names = os.listdir(dir_path)
    except (OSError, TypeError, ValueError):
        print('Error: dir not exits')
        return None
    return sorted(os.path.abspath(os.path.join(dir_path, name))
                  for name in names if name.endswith('.txt'))

def count_word(files):
    """Yield a word-frequency table for each file in *files*.

    Every line is lower-cased, split into words with the regex ``\\b\\w+\\b``,
    and words in the stop-word set are dropped before counting.

    Args:
        files: Iterable of file paths to read, or ``None``. ``None`` is
            treated as "no files" so that a failed directory lookup does not
            turn into a ``TypeError`` here.

    Yields:
        ``(file, word_counts)`` tuples, where ``word_counts`` is a
        ``collections.Counter`` mapping each remaining word to its number of
        occurrences in that file.
    """
    if files is None:
        return
    # A frozenset gives O(1) membership tests for the stop-word filter.
    IGNORED_WORDS = frozenset(['a', 'an', 'as', 'is', 'in', 'of', 'to',
                               'are', 'has', 'and', 'the'])
    # Compiled once; reused for every line of every file.
    word_pattern = re.compile(r'\b\w+\b')
    for file in files:
        word_counts = Counter()
        with open(file) as f:
            for line in f:
                # Normalize case so "The" and "the" count as the same word.
                words = word_pattern.findall(line.lower())
                # Drop stop words before counting.
                word_counts.update(word for word in words
                                   if word not in IGNORED_WORDS)
        yield (file, word_counts)

def show(results, num):
    """Print the *num* most frequent words of every file in *results*.

    Args:
        results: Iterable of ``(file, word_counts)`` pairs, where
            ``word_counts`` provides ``most_common(num)`` (as
            ``collections.Counter`` does).
        num: How many of the top words to print for each file.
    """
    for file, word_counts in results:
        print('The most common words in ', file)
        for word, counts in word_counts.most_common(num):
            # Report the word's own occurrence count, not the top-N size.
            print('\t' + word + ' appears ', counts, ' times')
        print()


if __name__ == '__main__':
    # Collect the diary text files, tally the words of each one, and print
    # the four most frequent words per file.
    diary_files = get_txt('./diary')
    show(count_word(diary_files), num=4)

标签:files,0006,word,python,练习,file,print,counts,dir
来源: https://blog.csdn.net/m0_38015368/article/details/89282508