xander
超级版主
  
UID 1
Digest
2
Points 2
Posts 169
码币MB 309 Code
黄金 0 Catty
钻石 903 Pellet
Permissions 200
Register 2022-2-7
Status offline
|
RE:python爬虫设计 --- XPath 获取网页内容并使用 headers 构造请求头信息,骗过网站。
import requests import re import csv from lxml import etree import time
# Fetch one page.
def request_pages(urls):
    """Download a ranking page and return its body as UTF-8 text.

    Parameters:
        urls: URL of the page to fetch.

    Returns:
        The response body decoded from UTF-8.
    """
    # Spoof a desktop-browser User-Agent so the site serves the page.
    # BUG FIX: the original header *value* started with the literal text
    # "User-Agent:", duplicating the header name inside the value.
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    # BUG FIX: the original called requests.get(url, ...) — reading the
    # *global* `url` set in __main__ and silently ignoring the parameter.
    resp = requests.get(urls, headers=ua)
    print(resp.status_code)
    # Decode explicitly rather than trusting resp.text's guessed charset;
    # the site serves UTF-8.
    html = resp.content.decode('utf-8')
    return html
# Parse one page.
def decode_pages(html):
    """Extract one record per university from a ranking page.

    Parameters:
        html: decoded HTML text of a ranking page.

    Returns:
        Tuple ``(list_university, title)`` — a list of per-row dicts and
        the list produced by the ``//title/text()`` XPath query.
    """
    e_html = etree.HTML(html)
    # 1. Page title (later used as the output file name).
    title = e_html.xpath('//title/text()')
    # 2. Rank column.
    university_index = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[1]/text()')
    # 3. University name.
    university_name = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[2]/a/div/text()')
    # 4. Region.
    university_location = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[3]/text()')
    # 5. Total score.
    university_score = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[4]/text()')
    # 6. Remaining sub-scores: seven cells per row, flattened across rows.
    university_score_other = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[@class="hidden-xs"]/text()')
    # Assemble one dict per table row.
    list_university = []
    for i in range(len(university_index)):
        # BUG FIX: the original stored the *whole* column lists in every
        # record (only 其他评分 was sliced), so each row dict contained
        # every university's data instead of its own.
        d1 = {
            '排名': university_index[i],
            '大学名称': university_name[i],
            '区域': university_location[i],
            '总分': university_score[i],
            '其他评分': university_score_other[i * 7:i * 7 + 7]
        }
        list_university.append(d1)
    print("解析成功:%s" % (title[0]))
    return list_university, title
# Save as JSON-lines text.
def save_to_txt(list_university, title):
    """Append each university record as one JSON line to ``e:<title>.txt``.

    Parameters:
        list_university: list of per-university dicts from decode_pages().
        title: list whose first element is the page title (used, stripped,
            as the file name).
    """
    # 'a' mode so successive years accumulate in the same file.
    with open(r'e:{}.txt'.format(title[0].strip()), 'a', encoding='utf-8') as df:
        for one in list_university:
            # ensure_ascii=False keeps the Chinese text human-readable.
            # NOTE: requires the module-level `import json` (missing in the
            # original file — this raised NameError at runtime).
            df.write(json.dumps(one, ensure_ascii=False) + '\n')
# Save as CSV.
def save_to_csv(list_university, title):
    """Write the university records to ``e:<title>.csv`` (GBK encoded).

    Each record's seven-element '其他评分' list is expanded into seven
    named score columns before writing.

    Parameters:
        list_university: list of per-university dicts from decode_pages().
        title: list whose first element is the page title (used, stripped,
            as the file name).
    """
    header = ['排名', '大学名称', '区域', '总分',
              '研究生比例', '留学生比例', '师生比',
              '博士学位授权总量', '博士学位授权师均',
              '校友获奖总量', '校友获奖生均']
    # Flatten 其他评分 into the seven trailing columns.
    flattened = []
    for record in list_university:
        sub_scores = record['其他评分']
        row = {
            '排名': record['排名'],
            '大学名称': record['大学名称'],
            '区域': record['区域'],
            '总分': record['总分'],
        }
        for offset, column in enumerate(header[4:]):
            row[column] = sub_scores[offset]
        flattened.append(row)
    # GBK so the file opens cleanly in Chinese-locale Excel.
    with open(r'e:{}.csv'.format(title[0].strip()), 'w', encoding='gbk', newline='') as cf:
        writer = csv.DictWriter(cf, fieldnames=header)
        writer.writeheader()
        writer.writerows(flattened)
if __name__ == '__main__':
    # NOTE: pages from 2015 and earlier use a different HTML structure and
    # would need different parsing rules.
    for year in range(2016, 2020):
        url = "http://www.zuihaodaxue.com/Greater_China_Ranking%d_0.html" % (year)
        # Throttle: pause 2 s between requests to be polite to the server.
        time.sleep(2)
        html = request_pages(url)
        records, page_title = decode_pages(html)
        save_to_txt(records, page_title)
        save_to_csv(records, page_title)
|
|