The idea: crawl your Sina Weibo posts, run them through word segmentation to pick out the words you use most often, and render those words as a word-cloud image.
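The keyword step is handled by jieba's TF-IDF extractor; a minimal sketch of just that piece (the sample sentence is only illustrative):

import jieba.analyse

sample = u"今天天气不错，和朋友一起去公园散步，心情很好"  # any sentence works here
# extract_tags ranks the words by TF-IDF weight and returns the top ones
print(jieba.analyse.extract_tags(sample, topK=5))
# prints something like ['散步', '公园', '心情']; the exact list depends on jieba's dictionary

The complete script: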
# -*- coding:utf-8 -*-
import codecs
import csv
import re
import jieba.analyse
import matplotlib.pyplot as plt
import requests
from scipy.misc import imread
from wordcloud import WordCloud
# Fill in your own cookie values: open the mobile Weibo site (m.weibo.cn) in
# Chrome, log in, then copy the cookies from the developer tools.
cookies = {
    "ALF": "",
    "SCF": "",
    "SUBP": "",
    "SUB": "",
    "SUHB": "",
    "SSOLoginState": "",
    "M_WEIBOCN_PARAMS": "",
    "H5_INDEX": "",
    "H5_INDEX_TITLE": ""
}
def cleanring(content):
    """Strip repost markers, punctuation and other noise from a post."""
    # the "?" is escaped so the pattern compiles; add any other junk you want removed
    pattern = r"转发微博|//:|Repost|,|\?|。|、|分享图片"
    content = re.sub(pattern, "", content)
    return content

def fetch_weibo():
    """Page through the logged-in user's timeline and yield cleaned post texts."""
    api = "http://m.weibo.cn/index/my?format=cards&page=%s"
    for i in range(1, 50):
        response = requests.get(url=api % i, cookies=cookies)
        data = response.json()[0]
        groups = data.get("card_group") or []
        for group in groups:
            text = group.get("mblog").get("text")
            text = text.encode("utf-8")
            text = cleanring(text).strip()
            if text:
                yield text
def write_csv(texts):
    with codecs.open('./weibo.csv', 'w') as f:
        writer = csv.DictWriter(f, fieldnames=["text"])
        writer.writeheader()
        for text in texts:
            writer.writerow({"text": text})
def read_csv():
    with codecs.open('./weibo.csv', 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            yield row['text']
def word_segment(texts):
    """Pull the top 20 keywords out of every post with jieba's TF-IDF analyser."""
    jieba.analyse.set_stop_words("./stopwords.txt")
    for text in texts:
        tags = jieba.analyse.extract_tags(text, topK=20)
        yield " ".join(tags)
def generate_img(texts):
    """Draw the word cloud inside the heart-shaped mask and save it to disk."""
    data = " ".join(text for text in texts)
    mask_img = imread('./heart-mask.jpg', flatten=True)  # grayscale mask for the cloud shape
    wordcloud = WordCloud(
        font_path='./simsunb.ttf',
        background_color='white',
        mask=mask_img
    ).generate(data)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.savefig('./heart.jpg', dpi=600)
if __name__ == '__main__':
    texts = fetch_weibo()
    write_csv(texts)
    generate_img(word_segment(read_csv()))
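To run it you need three local files next to the script: stopwords.txt (the stop-word list for jieba), heart-mask.jpg (the shape mask), and simsunb.ttf (the font used to draw the words). One compatibility note: scipy.misc.imread was removed from recent SciPy releases, so if that import fails the mask can be loaded with Pillow and NumPy instead; a small sketch, assuming Pillow is installed:

import numpy as np
from PIL import Image

# WordCloud takes the mask as a NumPy array: pure-white (255) pixels count as
# background, and words are only drawn on the rest of the shape
mask_img = np.array(Image.open('./heart-mask.jpg').convert('L'))

The array can be passed straight to WordCloud(mask=...), so nothing else in generate_img needs to change.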