本文最后更新于 208 天前,其中的信息可能已经有所发展或是发生改变。
一、正则
import re

# Sample HTML list items containing both absolute and relative links.
s = '''<li><a href="http://news.sina.com.cn/o/2018-11-06/a75.shtml" target="_blank">进博会</a></li>
<li><a href="http://news.sina.com.cn/o/2018-11-06/a76.shtml" target="_blank">大数据</a></li>
<li><a href="/o/2018-11-06/a75.shtml" target="_blank">进博会</a></li>'''
# Extract the link portions with re.findall. Use a raw string so the
# backslash escapes are regex escapes, not (invalid) Python string
# escapes — the original non-raw pattern warns on Python 3.12+.
urls = re.findall(r'<a href="[a-zA-Z0-9/.\-:]+"', s)
print(urls)
for url in urls:
    # Strip the leading '<a href="' (9 chars) and the trailing quote.
    print(url[9:-1])
二、判断能否抓取
import urllib.robotparser
import requests

# Fetch and parse the site's robots.txt to check crawl permissions.
rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://item.taobao.com/robots.txt")
rp.read()

# Impersonating Googlebot: allowed to fetch the page below.
useragent = 'Googlebot'
# Impersonating Baiduspider: not allowed (per robots.txt at time of writing).
# useragent = 'Baiduspider'

url = 'https://item.taobao.com/item.htm?spm=a219r.lm897.14.38.5d2346e28rO73l&id=522811099442&ns=1&abbucket=7'
if rp.can_fetch(useragent, url):
    print("允许抓取")
    file = requests.get(url)
    data = file.content  # raw response bytes
    # Save the fetched page locally. 'with' guarantees the handle is
    # closed even if the write fails (the original leaked it on error).
    with open("bd-html", "wb") as fb:
        fb.write(data)
else:
    print("不允许抓取")
三、cookies
# Python 3.6 environment
import requests
import re

# Cookie string copied from the browser's developer tools and saved to a
# local text file (format: "name1=value1; name2=value2; ...").
cookies = {}  # cookie jar as a plain dict
# 'with' closes the file handle (the original left it open).
with open(r'taobao-dd.txt', 'r') as f:
    # Split on ';' — one "name=value" pair per piece. maxsplit=1 keeps
    # any '=' characters inside a cookie VALUE intact.
    for line in f.read().split(';'):
        name, value = line.strip().split('=', 1)
        cookies[name] = value

r = requests.get("https://www.taobao.com/", cookies=cookies)
# print(r.text)
rs = re.findall(u'<title>.*</title>', r.text)  # <title>淘宝网 - 淘!我喜欢</title>
print(rs)
四、建立MYHTMLParser
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Print the text content of <h1 align="center"> elements while parsing.

    Every start tag, end tag and data chunk is echoed for demonstration;
    data is additionally reported as "Extracted data" while inside a
    centered <h1>.
    """

    # True while the parser is inside an <h1 align="center"> element.
    ctag = False

    def handle_starttag(self, tag, attrs):
        print('begin a tag:' + tag)
        if tag == 'h1':
            for attr in attrs:
                print(attr[0])
                # Bug fix: the original compared only the attribute VALUE
                # (attr[1] == 'center'), so e.g. class="center" would also
                # match. Require the pair to be exactly align="center".
                if attr == ('align', 'center'):
                    self.ctag = True
                    break

    def handle_data(self, data):
        print('handle a tag')
        if self.ctag:
            print("Extracted data :", data)

    def handle_endtag(self, tag):
        print('end a tag:' + tag)
        # Leaving any element ends the "inside a centered h1" state.
        self.ctag = False
# Drive the parser over a small demo document (three <h1> headings,
# two centered and one right-aligned).
demo_html = (
    '<html><head><title>Test</title></head>'
    '<body><h1 align="center">Big data news</h1>'
    '<h1 align="center">AI news</h1>'
    '<h1 align="right">2018.8.1</h1></body></html>'
)
parser = MyHTMLParser()
parser.feed(demo_html)
五、xpath
# coding: utf-8
from lxml import etree

html = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Parse the document and select every <h1> under <body>.
content = etree.fromstring(html)
for node in content.xpath('/html/body/h1'):
    # text() yields the element's text children; print the first one.
    print(node.xpath('./text()')[0])
html = '<html><head><title>Test</title></head><body><table id="table1"cellspacing="0px"><tr><th>学号</th><th>姓名</th><th>成绩</th></tr><tr><td>1001</td><td>曾平</td><td>90</td></tr><tr><td>1002</td><td>王一</td><td>92</td></tr><tr><td>1003</td><td>张三</td><td>88</td></tr></table></body></html>'
content = etree.HTML(html)
# Select every table row except the first (the <th> header row).
rows = content.xpath('//table[@id="table1"]/tr')[1:]
for row in rows:
    # Renamed from 'id' to avoid shadowing the builtin id().
    student_id = row.xpath('./td[1]/text()')[0]
    name = row.xpath('./td[2]/text()')[0]
    score = row.xpath('./td[3]/text()')[0]
    print(student_id, name, score)
print("演示提示最后一个记录")
html = '''<html><head><title>Test</title></head><body><table id="table1"cellspacing="0px"><tr><th>学号</th><th>姓名</th><th>成绩</th></tr>
<tr><td>1001</td><td>曾平</td><td>90</td></tr>
<tr><td>1002</td><td>王一</td><td>92</td></tr>
<tr><td>1003</td><td>张三</td><td>88</td></tr>
</table></body></html>'''
content = etree.HTML(html)
# last() selects only the final <tr> of the table.
rows = content.xpath('//table[@id="table1"]/tr[last()]')
for row in rows:
    # Renamed from 'id' to avoid shadowing the builtin id().
    student_id = row.xpath('./td[1]/text()')[0]
    name = row.xpath('./td[2]/text()')[0]
    score = row.xpath('./td[3]/text()')[0]
    print(student_id, name, score)
六、通过特定的tree来处理,提取指定内容
# -*- coding: utf-8 -*-
import html5lib

print('通过指定treebuilder来解析:')
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Parse directly with html5lib.parse, building an lxml tree so that
# xpath() is available on the result.
content = html5lib.parse(document, treebuilder="lxml", namespaceHTMLElements=False)
# Select every <h1> under <body> and print its text.
for heading in content.xpath('/html/body/h1'):
    print(heading.xpath('./text()')[0])
print('通过指定tree来解析:')
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news</h1><h1 align="right">2018.8.1</h1></body></html>'
# Build an HTMLParser instance that constructs an lxml tree (xpath works).
p = html5lib.HTMLParser(strict=False, tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
# Named 'tree' instead of the original 't', which was then reused for
# the loop text — confusing variable shadowing.
tree = p.parse(document)
rows = tree.xpath('/html/body/h1')
for row in rows:
    text = row.xpath('./text()')[0]
    print(text)
print('通过指定tree来提取超链接:')
document = '<html><head><title>Test</title></head><body><a href="www.baidu.com">baidu</body></html>'
p = html5lib.HTMLParser(strict=False, tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
tree = p.parse(document)
# findall locates every <a> element anywhere in the tree, then the
# href attribute value is read from each element's attrib mapping.
for anchor in tree.findall(".//a"):
    url = anchor.attrib["href"]
    print(url)
print('处理标签不完整或有错误的HTML:')
# This document is malformed: a missing </h1> and a bogus </m1> end tag.
# html5lib repairs such markup the way browsers do.
document = '<html><head><title>Test</title></head><body><h1 align="center">Big data news</h1><h1 align="center">AI news<h1 align="right">2018.8.1</m1></body></html>'
p = html5lib.HTMLParser(strict=False, tree=html5lib.getTreeBuilder('lxml'),
                        namespaceHTMLElements=False)
# Named 'tree' instead of the original 't', which was then reused for
# the loop text — confusing variable shadowing.
tree = p.parse(document)
rows = tree.xpath('/html/body/h1')
for row in rows:
    text = row.xpath('./text()')[0]
    print(text)
七、爬取特定内容
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

html = '''
<html><body><div id="second-title">访华前 这个国家的总理说“感谢中国体谅”</div>
<div class="date-source">
<span class="date">2019年03月27日 21:30</span></div>
<span class="publish source">参考消息</span>
<div class="article">
<p>原标题:锐参考 | 访华前,这个国家的总理说:“感谢中国体谅!”</p>
<p>“非常感谢中国的理解!”</p>
<p>在25日的新闻发布会上,新西兰总理杰辛达·阿德恩这样说道。</p>
</div>
</body></html>
'''
soup = BeautifulSoup(html, 'lxml')
# '#' selects by id.
title = soup.select('div#second-title')[0].text
# '.' selects by class name.
date = soup.select('span.date')[0].text
# A multi-valued class attribute ("publish source") is written with dots.
source = soup.select('span.publish.source')[0].text
# '>' selects direct children.
content = soup.select('div.article > p')
# join over the elements instead of the original index loop with
# string += (clearer, and avoids quadratic concatenation).
contentstr = ''.join(p.text + "\n" for p in content)
print("标题:", title)
print("发布日期:", date)
print("消息来源:", source)
print("消息内容:", contentstr)
八、选择已经爬取的内容
# -*- coding: utf-8 -*-
from pyquery import PyQuery

html = '''
<html><body><div id="second-title">访华前 这个国家的总理说“感谢中国体谅”</div>
<div class="date-source">
<span class="date">2019年03月27日 21:30</span></div>
<span class="publish source">参考消息</span>
<div class="article">
<p>原标题:锐参考 | 访华前,这个国家的总理说:“感谢中国体谅!”</p>
<p>“非常感谢中国的理解!”</p>
<p>在25日的新闻发布会上,新西兰总理杰辛达·阿德恩这样说道。</p>
</div>
</body></html>
'''
py = PyQuery(html)
# '#' selects by id.
title = py('div#second-title')[0].text
# '.' selects by class name.
date = py('span.date')[0].text
# A multi-valued class attribute ("publish source") is written with dots.
source = py('span.publish.source')[0].text
# '>' selects direct children.
content = py('div.article > p')
# Iterate the matched elements directly instead of the original
# range(len(...)) index loop with quadratic string +=.
contentstr = ''.join(p.text + "\n" for p in content)
print("标题:", title)
print("发布日期:", date)
print("消息来源:", source)
print("消息内容:", contentstr)
# A PyQuery selection can itself be queried again.
xs = py('body').find('div')
d = xs(".date-source .date")
print(d[0].text)
九、依据特征词汇,爬取特定url
# -*- coding: utf-8 -*-
import urllib.robotparser
import requests
from bs4 import BeautifulSoup
import jieba
from gensim.corpora.dictionary import Dictionary
import os
import re
#保存文件
def savefile(file_dir, content, seq):
    """Save *content* (a str) as UTF-8 bytes to <file_dir>/<seq>.html.

    :param file_dir: target directory (must already exist)
    :param content: page text to persist
    :param seq: sequence number used as the file name
    """
    file_path = file_dir + os.sep + str(seq) + '.html'
    # 'with' closes the handle even if the write raises
    # (the original leaked it on error).
    with open(file_path, "wb") as f:
        f.write(content.encode("utf-8"))
# HTTP headers: a browser-like User-Agent so the site serves normal pages.
useragent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
http_headers = {
    'User-Agent': useragent,
    'Accept': 'text/html'
}
# The topic is defined as a set of feature keywords.
topicwords = {"网络", "安全", "法案", "预警", "设施", "互联网"}
website = 'http://roll.news.sina.com.cn/'
url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml'
file_dir = 'e:\\xxx'

rp = urllib.robotparser.RobotFileParser()
rp.set_url(website + "robots.txt")
rp.read()
# Only proceed when robots.txt permits fetching the listing page.
if rp.can_fetch(useragent, url):
    page = requests.get(url, headers=http_headers)
    page.encoding = 'gb2312'
    content = page.text
    # Load the stop-word list; 'with' guarantees the file handle is
    # closed (the original leaked it).
    with open('stopword.txt', 'r', encoding="utf-8") as sf:
        stoplist = set(w.strip() for w in sf)
    # Extract strings like
    # href="http://news.sina.com.cn/o/2018-11-06/doc-ihmutuea7351575.shtml".
    # Raw string avoids the invalid '\-' escape of the original pattern.
    ulist = re.findall(r'href="http://[a-z0-9/.\-]+\.shtml', content)
    i = 1
    for u in ulist:
        u = u[6:]  # drop the leading 'href="'
        print(u)
        page = requests.get(u, headers=http_headers)
        page.encoding = 'utf-8'
        content = page.text
        bs = BeautifulSoup(content, 'lxml')
        ps = bs.select('div#article > p')
        doc = []  # one token list per non-empty paragraph
        for p in ps:
            p = p.text.strip("\n")
            if p != "":
                d = []
                # Tokenize the paragraph (full mode), dropping
                # single-character tokens and stop words.
                for w in list(jieba.cut(p, cut_all=True)):
                    if len(w) > 1 and w not in stoplist:
                        d.append(w)
                doc.append(d)
        # Feature selection: keep words occurring at least twice and in at
        # most (paragraphs containing word / total paragraphs) <= 1.0;
        # take the top 10 as the page's representative vocabulary.
        dictionary = Dictionary(doc)
        dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=10)
        docwords = set(dict(dictionary.items()).values())
        # Relevance = Jaccard similarity of topicwords and docwords.
        commwords = topicwords.intersection(docwords)
        sim = len(commwords) / (len(topicwords) + len(docwords) - len(commwords))
        # Consider the page on-topic and save it when similar enough.
        if sim > 0.1:
            print(docwords)
            print("sim=", sim)
            savefile(file_dir, content, i)
            i = i + 1
else:
    print('不允许抓取!')