import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Load the mooncake listings (title, price, sales, location) from the comma-separated text file
df = pd.read_csv('月饼数据.txt', sep=',', names=['title', 'price', 'sales', 'location'], encoding='utf-8')
# Tokenize each product title with jieba
title = df.title.values.tolist()
title_s = []
for line in title:
    title_cut = jieba.lcut(line)
    title_s.append(title_cut)
# Filter out stopwords and other uninformative tokens
stopwords = ["月饼", "礼品", "口味", "礼盒", "包邮", "【", "】", "送礼", "大", "中秋节", "中秋月饼",
             "2", "饼", "蓉", "多", "个", "味", "斤", "送", " ", "老", "北京", "云南", "网红老"]
title_clean = []
for line in title_s:
    line_clean = []
    for word in line:
        if word not in stopwords:
            line_clean.append(word)
    title_clean.append(line_clean)
# De-duplicate words within each title so a word counts at most once per listing
title_clean_dist = []
for line in title_clean:
    line_dist = []
    for word in line:
        if word not in line_dist:
            line_dist.append(word)
    title_clean_dist.append(line_dist)
# Flatten the de-duplicated titles into a single list of words
allwords_clean_dist = []
for line in title_clean_dist:
    for word in line:
        allwords_clean_dist.append(word)
# Count how many listings mention each word
df_allwords_clean_dist = pd.DataFrame({'allwords': allwords_clean_dist})
word_count = df_allwords_clean_dist.allwords.value_counts().reset_index()
word_count.columns = ['word', 'count']
# Draw a word cloud from the 100 most frequent words and save it to data.png
wc = WordCloud(width=1920, height=1080, max_words=2000, background_color='white',
               font_path='simhei.ttf', max_font_size=400, random_state=50)
wc = wc.fit_words({x[0]: x[1] for x in word_count.head(100).values})
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
wc.to_file("data.png")
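# Optional: sanity-check the most frequent tokens so that any remaining
# uninformative words can be added to the stopwords list before the cloud
# is regenerated. A minimal sketch, assuming word_count from the steps above:
print(word_count.head(10))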