Title: Python crawler design --- fetching page content with XPath and forging header info to fool the website
 
xander (Super Moderator)
Python crawler design --- fetching page content with XPath and forging header info to fool the website

import requests
import csv
import json  # used by save_to_txt
from lxml import etree
import time


# Request a page, sending a forged browser User-Agent so the site
# serves the same HTML it would serve a real browser
def request_pages(url):
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    resp = requests.get(url, headers=ua)
    print(resp.status_code)  # 200 means the site accepted the request
    html = resp.content.decode('utf-8')  # decode the raw bytes as UTF-8
    return html
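

# To confirm the forged User-Agent is what the server actually sees, a quick
# sanity check against httpbin.org (an echo service, not part of the original
# script) can look like this; illustrative only, never called below:
def check_user_agent():
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    resp = requests.get('https://httpbin.org/headers', headers=ua)
    print(resp.json()['headers']['User-Agent'])  # echoes the forged string back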


# Parse the page
def decode_pages(html):
    e_html = etree.HTML(html)
    # 1. Extract the page title
    title = e_html.xpath('//title/text()')

    # 2. Extract the rank column
    university_index = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[1]/text()')

    # 3. Extract the university names
    university_name = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[2]/a/div/text()')

    # 4. Extract the regions
    university_location = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[3]/text()')

    # 5. Extract the total scores
    university_score = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[4]/text()')

    # 6. Extract the seven remaining per-criterion scores of each row
    university_score_other = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[@class="hidden-xs"]/text()')

    # Collect one dict per university
    list_university = []
    for i in range(len(university_index)):
        d1 = {
            '排名': university_index[i],
            '大学名称': university_name[i],
            '区域': university_location[i],
            '总分': university_score[i],
            '其他评分': university_score_other[i * 7:i * 7 + 7]
        }
        list_university.append(d1)
    print("Parsed successfully: %s" % title[0])
    return list_university, title
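
# For reference, each parsed entry comes out shaped roughly like this
# (the values are made up for illustration, not real ranking data):
# {'排名': '1', '大学名称': '清华大学', '区域': '北京', '总分': '95.3',
#  '其他评分': ['13.3', '7.1', '11.5', '38.2', '8.9', '14.8', '2.4']}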


# Save as txt, one JSON object per line
def save_to_txt(list_university, title):
    # 'a' appends on every run, so repeated runs grow the same file
    with open(r'e:{}.txt'.format(title[0].strip()), 'a', encoding='utf-8') as df:
        for one in list_university:
            df.write(json.dumps(one, ensure_ascii=False) + '\n')


# Save as csv
def save_to_csv(list_university, title):
    # Split the seven extra scores into named columns
    dict_university_new = []
    for i in list_university:
        temp = {
            '排名': i['排名'],
            '大学名称': i['大学名称'],
            '区域': i['区域'],
            '总分': i['总分'],
            '研究生比例': i['其他评分'][0],
            '留学生比例': i['其他评分'][1],
            '师生比': i['其他评分'][2],
            '博士学位授权总量': i['其他评分'][3],
            '博士学位授权师均': i['其他评分'][4],
            '校友获奖总量': i['其他评分'][5],
            '校友获奖生均': i['其他评分'][6]
        }
        dict_university_new.append(temp)
    with open(r'e:{}.csv'.format(title[0].strip()), 'w', encoding='gbk', newline='') as cf:
        writer = csv.DictWriter(cf,
                                fieldnames=['排名', '大学名称', '区域', '总分', '研究生比例', '留学生比例',
                                            '师生比', '博士学位授权总量', '博士学位授权师均',
                                            '校友获奖总量', '校友获奖生均'])
        writer.writeheader()
        writer.writerows(dict_university_new)
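
# Note: encoding='gbk' is presumably chosen so the CSV opens cleanly in Excel
# on Chinese-locale Windows; encoding='utf-8-sig' is a common alternative if
# any field falls outside the GBK character set.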


if __name__ == '__main__':
    # Note: pages from 2015 and earlier use a different structure,
    # so the parsing rules above would need adjusting for them
    for i in range(2016, 2020):
        url = "http://www.zuihaodaxue.com/Greater_China_Ranking%d_0.html" % i
        time.sleep(2)  # wait 2 s between requests to go easy on the server
        html = request_pages(url)
        list_university = decode_pages(html)
        save_to_txt(list_university[0], list_university[1])
        save_to_csv(list_university[0], list_university[1])
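
To see how the XPath rules pick fields out of the ranking table, here is a minimal, self-contained check against a hand-written HTML fragment (the fragment is invented for illustration; the real pages are much larger but follow the same td layout):

from lxml import etree

sample = '''
<div class="news-text"><table><tbody>
<tr><td>1</td><td><a href="#"><div>清华大学</div></a></td><td>北京</td><td>95.3</td>
<td class="hidden-xs">13.3</td><td class="hidden-xs">7.1</td><td class="hidden-xs">11.5</td>
<td class="hidden-xs">38.2</td><td class="hidden-xs">8.9</td><td class="hidden-xs">14.8</td>
<td class="hidden-xs">2.4</td></tr>
</tbody></table></div>
'''
e = etree.HTML(sample)
print(e.xpath('//div[@class="news-text"]//tbody/tr/td[1]/text()'))        # ['1']
print(e.xpath('//div[@class="news-text"]//tbody/tr/td[2]/a/div/text()'))  # ['清华大学']
print(e.xpath('//div[@class="news-text"]//tbody/tr/td[@class="hidden-xs"]/text()'))  # the seven extra scores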


2022-11-1 12:22 | #1
 
 
     

