爬虫 - Ikko's Blog

初始代码

import requests
import re
from bs4 import BeautifulSoup
from pandas import DataFrame

# Listing page of the latest policy documents on www.gov.cn.
url = 'http://www.gov.cn/zhengce/zuixin.htm'
UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
# The HTTP header is spelled "User-Agent" (with a hyphen). With an underscore
# requests sends an unrelated header and keeps its own default User-Agent.
headers = {'User-Agent': UA}

# A timeout prevents the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=10)
r.encoding = 'utf-8'

soup = BeautifulSoup(r.text, 'lxml')
attrs = {'class': 'date'}
# Every tag whose href contains "content", and every <span class="date">.
links = soup.find_all(href=re.compile('content'))
dates = soup.find_all(name='span', attrs=attrs)

# Get titles and links
titles = []
urls = []
for link in links:
    titles.append(str(link.string))
    # Read the href directly so the listing-page `url` above is not overwritten.
    urls.append(str(link.get('href')))

# Get days
days = []
pattern = re.compile(r'(\d+)-(\d+)-(\d+)')
for date in dates:
    # get_text() always returns a str, whereas .string is None as soon as the
    # span contains more than one child node.
    day = pattern.search(date.get_text())
    # Keep one entry per span (empty when no date is found) so the column
    # stays aligned with the spans that were matched.
    days.append(day.group() if day else '')

# All three lists must have the same length for DataFrame to accept them.
data = {'date': days,
        'title': titles,
        'url': urls}
frame = DataFrame(data)
# index=False leaves the DataFrame row index out of the CSV file.
frame.to_csv('test.csv', index=False)

个人理解

上面这段代码主要的目的是抓取中国政府网（www.gov.cn）中最新的法规政策，并将其日期、标题和链接保存到CSV文件中。

import requests: 引入requests库，用于网络请求操作。
import re: 引入re库，用于正则表达式匹配。
from bs4 import BeautifulSoup: 引入BeautifulSoup库，用于解析HTML页面。
from pandas import DataFrame: 从pandas库中引入DataFrame类，用于构建数据表格。

接下来是变量定义：

url = 'http://www.gov.cn/zhengce/zuixin.htm'：指定要爬取数据的网站URL。
UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'：设置User-Agent，模拟浏览器进行访问。
headers = {'User_Agent': UA}: 设置请求头部信息，包括User-Agent。
r = requests.get(url, headers=headers)：发送HTTP GET请求获取网站页面源码。
r.encoding = 'utf-8'：设置网页内容的编码格式为UTF-8。
soup = BeautifulSoup(r.text, 'lxml')：使用BeautifulSoup库解析HTML页面。
attrs = {'class': 'date'}：设定属性字典{'class': 'date'}，用于查找所有class为'date'的span标签。
links = soup.find_all(href=re.compile('content'))：使用正则表达式查找href属性中包含'content'字符串的所有标签（在该页面中即a标签）。
dates = soup.find_all(name='span', attrs=attrs)：通过name参数和attrs参数进行筛选，查找所有class为'date'的span标签。

接下来，代码获取标题、链接和日期数据：

使用for循环遍历所有查找到的links元素，将其中的标题保存到变量titles列表中，将链接地址存储到urls列表中。
使用正则表达式从日期的字符串中提取出日期信息，并将其追加到days列表中。

最后，该程序创建了一个数据字典data，并使用DataFrame类构建了一个数据表格frame。然后，将这个表格以CSV格式写入test.csv文件中，其中index=False指定不要写入索引值（即第一列）到文件中。
爬出数据是这样的：

date,title,url
2023-04-18,中共中央印发《中央党内法规制定工作规划纲要（2023－2027年）》,/zhengce/2023-04/18/content_5752088.htm
2023-04-18,国务院办公厅关于调整第19届亚运会和第4届亚残运会工作领导小组组成人员等有关事项的通知,http://www.gov.cn/zhengce/content/2023-04/18/content_5752017.htm
2023-04-14,国务院办公厅关于上市公司独立董事制度改革的意见,http://www.gov.cn/zhengce/content/2023-04/14/content_5751463.htm
2023-04-12,征兵工作条例,http://www.gov.cn/zhengce/content/2023-04/12/content_5750986.htm
2023-04-10,中共中央发出关于学习《习近平著作选读》第一卷、第二卷的通知,/zhengce/2023-04/10/content_5750697.htm
2023-04-07,国务院办公厅关于成立第五次全国经济普查领导小组的通知,http://www.gov.cn/zhengce/content/2023-04/07/content_5750375.htm

代码修改

url修改

首先可以看到爬取的url是不规范的，有的是绝对url，有的是相对url，这样就需要对url进行处理，使其都是绝对url。对于不规范的URL，我们可以使用Python中的urllib.parse.urljoin()函数将相对URL转化为绝对URL。该函数可以将基础URL和相对URL合并成一个完整的URL，并返回结果。
通过调用urljoin()函数，将上述示例中的URL处理为绝对URL：

from urllib.parse import urljoin

# Site root that relative links are resolved against.
base_url = 'http://www.gov.cn/'

# Sample rows taken from the scraped CSV: (date, title, link).
# Some links are site-relative, others are already absolute.
data = [
    ('2023-04-18', '中共中央印发《中央党内法规制定工作规划纲要（2023－2027年）》', '/zhengce/2023-04/18/content_5752088.htm'),
    ('2023-04-18', '国务院办公厅关于调整第19届亚运会和第4届亚残运会工作领导小组组成人员等有关事项的通知', 'http://www.gov.cn/zhengce/content/2023-04/18/content_5752017.htm'),
    ('2023-04-14', '国务院办公厅关于上市公司独立董事制度改革的意见', 'http://www.gov.cn/zhengce/content/2023-04/14/content_5751463.htm'),
    ('2023-04-12', '征兵工作条例', 'http://www.gov.cn/zhengce/content/2023-04/12/content_5750986.htm'),
    ('2023-04-10', '中共中央发出关于学习《习近平著作选读》第一卷、第二卷的通知', '/zhengce/2023-04/10/content_5750697.htm'),
    ('2023-04-07', '国务院办公厅关于成立第五次全国经济普查领导小组的通知', 'http://www.gov.cn/zhengce/content/2023-04/07/content_5750375.htm'),
]

# Print every row with its link resolved against the site root.
for row in data:
    date, title, link = row
    print(date, title, urljoin(base_url, link))

对于相对URL，urljoin()函数会自动补全为绝对URL，而对于绝对URL，则不会进行任何更改。

修改成功后发现，绝对url和相对url对应的页面结构是不同的，因此需要先判断url的类型，再分别用不同的方式解析。

添加判断url为相对url还是绝对url

可以使用Python的urllib.parse.urlparse()函数将URL解析成6个部分（scheme，netloc，path，params，query和fragment），然后检查其中是否包含netloc字段，如果netloc为空，就是相对URL，否则就是绝对URL。

from urllib.parse import urlparse

def is_relative_url(url):
    """Return True when *url* has no network-location (host) component."""
    netloc = urlparse(url).netloc
    return not netloc

# Quick check: a site-relative path, then a fully qualified URL.
for sample in (
    '/zhengce/2023-04/18/content_5752088.htm',     # True
    'http://www.gov.cn/zhengce/content/2023-04/14/content_5751463.htm',     # False
):
    print(is_relative_url(sample))

先处理相对url

相对url对应的页面有四个字段：标题、时间、来源、正文。

提取标题

import requests
import re
from bs4 import BeautifulSoup
from pandas import DataFrame
# An article-style page (the kind reached through a relative link).
url = 'http://www.gov.cn/zhengce/2023-04/18/content_5752088.htm'
UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
# The HTTP header is spelled "User-Agent" (with a hyphen). With an underscore
# requests sends an unrelated header and keeps its own default User-Agent.
headers = {'User-Agent': UA}

# A timeout prevents the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=10)
r.encoding = 'utf-8'

soup = BeautifulSoup(r.text, 'lxml')

# The headline is taken from the first <h1> of the page.
title_element = soup.find('h1')
# find() returns None when the page has no <h1>; fall back to an empty title
# instead of crashing with AttributeError.
title = title_element.get_text().strip() if title_element is not None else ''

print(title)  # prints "中共中央印发《中央党内法规制定工作规划纲要（2023－2027年）》"

提取发布时间和来源

# This snippet parses a date, so it needs datetime; earlier snippets did not
# import it, and without this line strptime below raises NameError.
from datetime import datetime

# The <div class="pages-date"> holds both the publish time and the source.
pages_date = soup.find('div', class_='pages-date')

# The first child node of that div is the publish-time text.
publish_time = pages_date.contents[0].strip()
# Convert the "YYYY-MM-DD HH:MM" string into a datetime object.
publish_time = datetime.strptime(publish_time, '%Y-%m-%d %H:%M')
# The source lives in <span class="font">; drop the "来源：" label prefix.
source = pages_date.find('span', class_='font').text.strip().replace('来源：', '')


print("发布时间：", publish_time)
print("来源：", source)

提取正文

# Locate the <div> that holds the article body by its id.
pages_content = soup.find('div', id='UCAP-CONTENT')

# Collect every <p> element inside the body.
p_tags = pages_content.find_all('p')

# Build the text with one stripped paragraph per line.
lines = []
for p_tag in p_tags:
    # Drop the first <span> of the paragraph, if there is one, before
    # reading the paragraph text.
    span = p_tag.find('span')
    if span:
        span.extract()

    lines.append(p_tag.text.strip() + '\n')
text = ''.join(lines)
print(text)

处理绝对url

绝对url对应的页面有七个字段：主题分类、发文机关、标题、发文字号、成文日期、发布日期、正文。

from datetime import datetime

import requests
import re
from bs4 import BeautifulSoup
from pandas import DataFrame
# A document-style page (the kind reached through an absolute link).
url = 'http://www.gov.cn/zhengce/content/2023-04/12/content_5750986.htm'
UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
# The HTTP header is spelled "User-Agent" (with a hyphen). With an underscore
# requests sends an unrelated header and keeps its own default User-Agent.
headers = {'User-Agent': UA}

# A timeout prevents the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=10)
r.encoding = 'utf-8'

soup = BeautifulSoup(r.text, 'lxml')




# Locate the metadata table (class "bd1") and split it into rows.
info_table = soup.find('table', {'class': 'bd1'})
rows = info_table.find_all('tr')

# Cells of the first four rows; each field below is read by (row, column).
cells = [row.find_all('td') for row in rows[:4]]

topic_category = cells[0][3].text.strip()
publishing_organization = cells[1][1].text.strip()
title = cells[2][1].text.strip()
document_number = cells[3][1].text.strip()
# Both dates are written like "2023年04月12日"; parse them into datetime objects.
written_date = datetime.strptime(cells[1][3].text.strip(), '%Y年%m月%d日')
release_date = datetime.strptime(cells[3][3].text.strip(), '%Y年%m月%d日')

# Print the extracted fields, one labelled line each.
for label, value in (
    ('主题分类：', topic_category),
    ('发文机关：', publishing_organization),
    ('标题：', title),
    ('发文字号：', document_number),
    ('成文日期：', written_date),
    ('发布日期：', release_date),
):
    print(label, value)







# Locate the <td> that holds the document body by its id.
pages_content = soup.find('td', id='UCAP-CONTENT')

# Collect every <p> element inside the body.
p_tags = pages_content.find_all('p')

# Build the text with one stripped paragraph per line.
lines = []
for p_tag in p_tags:
    # Drop the first <span> of the paragraph, if there is one, before
    # reading the paragraph text.
    span = p_tag.find('span')
    if span:
        span.extract()

    lines.append(p_tag.text.strip() + '\n')
text = ''.join(lines)
print(text)

确保数据不重复

使用pickle模块将已经爬取过的url保存到本地，下次爬取时，先从本地读取已经爬取过的url，然后再进行爬取，这样就可以确保数据不重复了。

import pickle
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load a data.pickle that this crawler wrote itself.
try:
    with open('data.pickle', 'rb') as f:
        visited_urls = pickle.load(f)
except FileNotFoundError:
    # First run: nothing has been crawled yet, so start with an empty list.
    visited_urls = []
print(visited_urls)

最终代码

import requests
import re
from bs4 import BeautifulSoup
from pymysql import Error
from urllib.parse import urlparse
from datetime import datetime
import pickle
import pymysql


def crawl_and_process(urls_to_crawl):
    """Crawl the not-yet-visited policy pages and insert them into MySQL.

    URLs already recorded in ``data.pickle`` are skipped. Every other URL is
    parsed with ``relativeurl`` (no ``/content/`` in the URL) or
    ``absoluteurl`` (otherwise), and the resulting rows are inserted into
    ``search_policy`` in two batches, one per page layout. A URL is recorded
    as visited only after the batch containing it has been committed, so
    failed pages and failed inserts are retried on the next run.

    Args:
        urls_to_crawl: iterable of page URLs (strings).
    """
    # Load the URLs crawled by earlier runs.
    # NOTE: pickle.load can execute arbitrary code stored in the file, so
    # data.pickle must be a file this crawler wrote itself.
    try:
        with open('data.pickle', 'rb') as f:
            visited_urls = list(pickle.load(f))
    except FileNotFoundError:
        # Must be a list: the original fallback was a set, and appending to
        # it raised AttributeError on every first run.
        visited_urls = []
    # Set copy for O(1) membership tests; also de-duplicates within one call.
    seen = set(visited_urls)

    sql1 = 'INSERT INTO search_policy (policy_id, policy_title, pub_time, pub_agency, policy_body,policy_grade) VALUES (%s, %s,  %s, %s, %s, %s)'
    sql2 = 'INSERT INTO search_policy (policy_id, policy_title, pub_agency,pub_time,UPDATE_DATE,pub_number, policy_body,policy_grade) VALUES (%s, %s,  %s, %s,  %s, %s, %s, %s)'

    conn = pymysql.connect(host='', user='root', password='', database='search')
    try:
        cursor = conn.cursor()
        try:
            # New rows continue numbering after the current largest key.
            # NOTE(review): if policy_id is a text column, MAX() compares
            # lexicographically ('9' > '10') - confirm the column type.
            cursor.execute('SELECT MAX(policy_id) FROM search_policy')
            result = cursor.fetchone()
            new_id = int(result[0] or 0) + 1

            data1, urls1 = [], []  # article-style pages, inserted with sql1
            data2, urls2 = [], []  # document-style pages, inserted with sql2
            for url in urls_to_crawl:
                # Skip URLs that were already crawled.
                if url in seen:
                    continue

                print('正在爬取：', url)
                try:
                    # The parsers receive the id as a string, as before.
                    if '/content/' not in url:
                        data1.append(relativeurl(url, str(new_id)))
                        urls1.append(url)
                    else:
                        data2.append(absoluteurl(url, str(new_id)))
                        urls2.append(url)
                except (requests.RequestException, AttributeError,
                        IndexError, TypeError, ValueError) as e:
                    # One unreachable or oddly structured page must not
                    # discard the whole batch; report it and move on.
                    print('爬取失败：', url, e)
                    continue
                seen.add(url)
                new_id += 1

            # Insert each batch in its own transaction, with the same
            # rollback handling for both.
            for sql, rows, batch_urls in ((sql1, data1, urls1),
                                          (sql2, data2, urls2)):
                if not rows:
                    continue
                print('enter')
                try:
                    cursor.executemany(sql, rows)
                    conn.commit()
                except Error as e:
                    print(e)
                    conn.rollback()
                    continue
                # Only committed URLs count as visited.
                visited_urls.extend(batch_urls)
                print('ok')
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when crawling or SQL fails.
        conn.close()

    # Persist the visited URLs for the next run.
    with open('data.pickle', 'wb') as f:
        pickle.dump(visited_urls, f)


def is_relative_url(url):
    """Tell whether *url* is relative, i.e. has no network-location part."""
    netloc = urlparse(url).netloc
    return not netloc

def relativeurl(url,new_id):
    """Fetch an article-style policy page and build one database row from it.

    The page is expected to have an ``<h1>`` headline, a
    ``<div class="pages-date">`` holding the publish time and the source, and
    a ``<div id="UCAP-CONTENT">`` holding the body paragraphs.

    Args:
        url: address of the page to download.
        new_id: primary-key value to place first in the row.

    Returns:
        The tuple ``(new_id, title, publish_time, source, text, '国家级')``,
        where ``publish_time`` is a ``datetime`` and ``text`` has one stripped
        paragraph per line.

    Raises:
        requests.RequestException: the download failed or returned an HTTP
            error status.
        AttributeError: an expected element is missing from the page.
        ValueError: the publish time is not in ``YYYY-MM-DD HH:MM`` form.
    """
    UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    # The HTTP header is spelled "User-Agent" (with a hyphen). With an
    # underscore requests keeps sending its own default User-Agent.
    headers = {'User-Agent': UA}

    # The timeout avoids hanging on a stalled connection; raise_for_status
    # reports HTTP errors directly instead of failing later while parsing an
    # error page.
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'

    soup = BeautifulSoup(r.text, 'lxml')

    title = soup.find('h1').get_text().strip()

    # The pages-date div holds the publish time followed by the source.
    pages_date = soup.find('div', class_='pages-date')

    # Its first child node is the publish-time text; parse it to a datetime.
    publish_time = pages_date.contents[0].strip()
    publish_time = datetime.strptime(publish_time, '%Y-%m-%d %H:%M')
    # The source lives in <span class="font">; drop the "来源：" label prefix.
    source = pages_date.find('span', class_='font').text.strip().replace('来源：', '')

    # The body paragraphs live in the div with id "UCAP-CONTENT".
    pages_content = soup.find('div', id='UCAP-CONTENT')

    paragraphs = []
    for p_tag in pages_content.find_all('p'):
        # Drop the first <span> of the paragraph, if any, before reading
        # the paragraph text.
        span = p_tag.find('span')
        if span is not None:
            span.extract()
        paragraphs.append(p_tag.text.strip() + '\n')
    text = ''.join(paragraphs)

    return (new_id, title, publish_time, source, text, '国家级')

    
def absoluteurl(url,new_id):
    """Fetch a document-style policy page and build one database row from it.

    The page is expected to have a metadata ``<table class="bd1">`` and a
    ``<td id="UCAP-CONTENT">`` holding the body paragraphs. From the table the
    code reads the issuing organization and written date (second row), the
    title (third row), and the document number and release date (fourth row).

    Args:
        url: address of the page to download.
        new_id: primary-key value to place first in the row.

    Returns:
        The tuple ``(new_id, title, publishing_organization, written_date,
        release_date, document_number, text, '国家级')``, where both dates are
        ``datetime`` objects and ``text`` has one stripped paragraph per line.

    Raises:
        requests.RequestException: the download failed or returned an HTTP
            error status.
        AttributeError: an expected element is missing from the page.
        IndexError: the metadata table has fewer rows or cells than expected.
        ValueError: a date is not in ``YYYY年MM月DD日`` form.
    """
    UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    # The HTTP header is spelled "User-Agent" (with a hyphen). With an
    # underscore requests keeps sending its own default User-Agent.
    headers = {'User-Agent': UA}

    # The timeout avoids hanging on a stalled connection; raise_for_status
    # reports HTTP errors directly instead of failing later while parsing an
    # error page.
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')

    # Locate the metadata table and split it into rows.
    info_table = soup.find('table', {'class': 'bd1'})
    rows = info_table.find_all('tr')

    publishing_organization = rows[1].find_all('td')[1].text.strip()
    title = rows[2].find_all('td')[1].text.strip()
    document_number = rows[3].find_all('td')[1].text.strip()
    written_date = rows[1].find_all('td')[3].text.strip()
    release_date = rows[3].find_all('td')[3].text.strip()
    # Both dates are written like "2023年04月12日".
    written_date = datetime.strptime(written_date, '%Y年%m月%d日')
    release_date = datetime.strptime(release_date, '%Y年%m月%d日')

    # The body paragraphs live in the <td> with id "UCAP-CONTENT".
    pages_content = soup.find('td', id='UCAP-CONTENT')

    paragraphs = []
    for p_tag in pages_content.find_all('p'):
        # Drop the first <span> of the paragraph, if any, before reading
        # the paragraph text.
        span = p_tag.find('span')
        if span is not None:
            span.extract()
        paragraphs.append(p_tag.text.strip() + '\n')
    text = ''.join(paragraphs)

    return (new_id, title, publishing_organization, written_date,
            release_date, document_number, text, '国家级')

UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
# The HTTP header is spelled "User-Agent" (with a hyphen). With an underscore
# requests sends an unrelated header and keeps its own default User-Agent.
headers = {'User-Agent': UA}
for page in range(1, 83):  # listing pages 1 to 82
    cn = f'http://sousuo.gov.cn/column/30469/{page}.htm'
    try:
        # Download and parse one listing page; the timeout avoids hanging
        # forever on a stalled connection.
        r = requests.get(cn, headers=headers, timeout=10)
        r.encoding = 'utf-8'

        soup = BeautifulSoup(r.text, 'lxml')
        # Collect the href of every link on the page that contains "content".
        links = soup.find_all(href=re.compile('content'))
        urls = [str(link.get('href')) for link in links]
        print(urls)
        crawl_and_process(urls)
    except Exception as exc:
        # Best effort: a failing page must not stop the remaining pages, but
        # the failure is reported instead of being silently discarded.
        # Catching Exception rather than using a bare except keeps Ctrl-C
        # able to stop the crawl.
        print(f'第{page}页处理失败：', exc)