Python 练习 0006
作者:互联网
问题
你有一个目录,放了你一个月的日记,都是 txt,为了避免分词的问题,假设内容都是英文,请统计出你认为每篇日记最重要的词。
代码
from collections import Counter
import re, os
def get_txt(file_dir):
    """Return absolute paths of the ``.txt`` files directly inside *file_dir*.

    The directory is listed in place, so the process working directory is
    left unchanged. Sub-directories are not searched.

    Args:
        file_dir: Path of the directory to scan.

    Returns:
        A list of absolute paths (in ``os.listdir`` order) for every entry
        whose name ends with ``.txt``, or ``None`` if the directory cannot
        be listed. In the failure case an error message is printed.
    """
    try:
        base = os.path.abspath(file_dir)
        names = os.listdir(base)
    except (OSError, TypeError, ValueError):
        # Missing / unreadable directory, or a value that is not a usable
        # path at all. Other exceptions are deliberately left to propagate.
        print('Error: dir not exits')
        return None
    return [os.path.join(base, name)
            for name in names if name.endswith('.txt')]
def count_word(files):
    """Yield ``(path, Counter)`` pairs holding per-file word frequencies.

    Each file is read line by line, lower-cased, and split into runs of
    word characters; a small set of common English stop words is dropped
    before counting. Files are opened with the platform default encoding.

    Args:
        files: Iterable of file paths, or ``None`` (treated as no files so
            that a failed directory lookup does not crash the pipeline).

    Yields:
        A ``(path, collections.Counter)`` tuple for each file, in input
        order. This is a generator: a file is only read when its result
        is requested.
    """
    if files is None:
        return

    # Built once per call; a frozenset gives O(1) membership tests.
    ignored_words = frozenset(['a', 'an', 'as', 'is', 'in', 'of', 'to',
                               'are', 'has', 'and', 'the'])
    find_words = re.compile(r'\b\w+\b').findall

    for file in files:
        word_counts = Counter()
        with open(file) as f:
            for line in f:
                # Lower-case first so "The" and "the" count as one word.
                word_counts.update(
                    word for word in find_words(line.lower())
                    if word not in ignored_words)
        yield (file, word_counts)
def show(results, num):
    """Print the *num* most frequent words for every file in *results*.

    Args:
        results: Iterable of ``(path, Counter)`` pairs, as produced by
            ``count_word``.
        num: How many of the most common words to list per file.

    For each file a header line is printed, followed by one tab-indented
    line per word with its occurrence count, then a blank separator line.
    """
    for file, word_counts in results:
        print('The most common words in ', file)
        for word, counts in word_counts.most_common(num):
            # Report the word's own frequency, not the requested top-N size.
            print('\t' + word + ' appears ', counts, ' times')
        print()
if __name__ == '__main__':
    # Diary entries are read from the .txt files under ./diary.
    file_dir = './diary'
    files = get_txt(file_dir)
    # get_txt() returns None (after printing an error) when the directory
    # cannot be listed; skip counting rather than crash while iterating it.
    if files is not None:
        results = count_word(files)
        show(results, num=4)
标签:files,0006,word,python,练习,file,print,counts,dir 来源: https://blog.csdn.net/m0_38015368/article/details/89282508