1. Pick a topic you are interested in.
2. Write a crawler in Python that scrapes data on that topic from the web.
3. Run a text analysis on the scraped data and generate a word cloud (a word-cloud sketch follows the crawler code below).
4. Explain and interpret the results of the text analysis.
5. Write a complete blog post describing the implementation, the problems encountered and how they were solved, and the ideas and conclusions of the data analysis.
6. Finally, submit all the scraped data together with the crawler and analysis source code.
# -*- coding: UTF-8 -*-
import locale
import requests
import jieba
from bs4 import BeautifulSoup
from datetime import datetime

# Windows-only: lets the console print Chinese. setlocale() is called only for
# its side effect -- the original rebound the module name
# (locale = locale.setlocale(...)), which would break any later use of locale.
locale.setlocale(locale.LC_CTYPE, 'chinese')


def getKeyWords(text):
    # Strip punctuation before segmentation.
    punctuation = '''一!“”,。?、;’"',.、·《》()#\t:\n'''
    for s in punctuation:
        text = text.replace(s, '')
    newsList = jieba.lcut(text)
    # Single-character tokens are noise. The original built deleteList by
    # iterating over the still-empty newsDict, so nothing was ever removed;
    # build it from the tokens themselves instead.
    deleteList = [w for w in set(newsList) if len(w) < 2]
    newsSet = set(newsList) - set(deleteList)
    newsDict = {w: newsList.count(w) for w in newsSet}  # word-frequency dict
    dictList = sorted(newsDict.items(), key=lambda x: x[1], reverse=True)
    print('关键词:', dictList[:10])  # the original sorted but never showed or returned it
    return newsDict


def getNewDetail(newsUrl):
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    title = soupd.select('h1')[0].text
    info = soupd.select('.xinf-le')[0].text
    t = soupd.select('#pubtime')[0].text
    dt = datetime.strptime(t.strip(), '%Y-%m-%d %H:%M:%S')
    # str.lstrip() removes a *set of characters*, not a prefix, so
    # lstrip('标签:') could also eat the first characters of the tag itself;
    # removing the literal marker once is safe.
    biaoqian = soupd.select('.fenx-bq')[0].text.strip().replace('标签:', '', 1)
    # find(...) > 0 missed a marker sitting at index 0; test membership and
    # split on the marker instead.
    if '作者:' in info:
        au = info.split('作者:')[1].split()[0]
    else:
        au = 'none'
    if '来源:' in info:
        source = info.split('来源:')[1].split()[0]
    else:
        source = 'none'
    content = soupd.select('#Content')[0].text.strip()
    print("标题:", title)
    print("作者:", au)
    print("来源:", source)
    print("发布时间:", dt)
    print("正文:", content)
    print("标签:", biaoqian)
    getKeyWords(content)
    # A raw string replaces the mixed-slash 'D:\python/news.txt' path, and
    # `with` guarantees the file is closed even if a write fails.
    with open(r'D:\python\news.txt', 'a+', encoding='UTF-8') as fo:
        fo.write('标题:' + title + '\n' + '作者:' + au + '\n' + '来源:' + source
                 + '\n' + '正文:' + content + '\n' + '标签:' + biaoqian + '\n')


def getListPage(ListPageUrl):
    res = requests.get(ListPageUrl)
    res.encoding = 'utf-8'
    soupd = BeautifulSoup(res.text, 'html.parser')
    for news in soupd.select('.busBox1'):   # one .busBox1 block per headline
        atail = news.a.attrs['href']        # hrefs on this channel are already absolute
        getNewDetail(atail)


getListPage('http://ent.chinadaily.com.cn/node_53008149.htm')
for i in range(2, 10):
    # The original never interpolated i, so pages 2-9 all requested the literal
    # '..._{}.htm' URL; format() fixes the pagination.
    listUrl = 'http://ent.chinadaily.com.cn/node_53008149_{}.htm'.format(i)
    getListPage(listUrl)
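Step 3 asks for a word cloud, which the code above stops short of: getKeyWords builds the word-frequency dictionary but draws nothing. Below is a minimal sketch, assuming the third-party wordcloud and matplotlib packages are installed (pip install wordcloud matplotlib); the simhei.ttf path is a Windows assumption, and any font file with Chinese glyphs will do -- without an explicit font_path, WordCloud renders CJK text as empty boxes.

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def drawWordCloud(freq):
    # freq is the {word: count} dict returned by getKeyWords above.
    # font_path is an assumption: point it at any TTF with Chinese glyphs.
    wc = WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf',
                   width=800, height=600, background_color='white')
    wc.generate_from_frequencies(freq)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()

Calling drawWordCloud(getKeyWords(content)) inside getNewDetail would pop up one cloud per article; for a single cloud over the whole corpus, read the accumulated text back out of news.txt and pass that through getKeyWords instead.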
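As a cross-check on the hand-rolled frequency count, jieba itself ships a TF-IDF keyword extractor. The sketch below is an alternative to, not part of, the original analysis; extract_tags and its topK/withWeight parameters are part of jieba's documented API.

import jieba.analyse

def extractKeywords(text, topK=20):
    # TF-IDF ranking built into jieba; single-character noise largely
    # drops out on its own because such tokens score poorly.
    return jieba.analyse.extract_tags(text, topK=topK, withWeight=True)

Passing an article body (the content variable in getNewDetail) returns (word, weight) pairs ordered by weight, which can be compared against the counts printed by getKeyWords.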