xander
超级版主
  
UID 1
Digest
2
Points 2
Posts 169
码币MB 309 Code
黄金 0 Catty
钻石 903 Pellet
Permissions 200
Register 2022-2-7
Status offline
|
RE:python爬虫设计 --- XPath 获取网页内容并使用 headers 构造请求头信息,骗过网站。
import requests import re import csv from lxml import etree import time
# Fetch one page.
def request_pages(urls):
    """Download a ranking page and return its body as UTF-8 text.

    Parameters:
        urls: URL of the page to fetch.

    Returns:
        The response body decoded from UTF-8.
    """
    # Spoof a desktop-browser User-Agent so the site serves the page.
    # BUG FIX: the original header *value* started with the literal text
    # "User-Agent:", duplicating the header name inside the value.
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    # BUG FIX: the original called requests.get(url, ...) — reading the
    # *global* `url` set in __main__ and silently ignoring the parameter.
    resp = requests.get(urls, headers=ua)
    print(resp.status_code)
    # Decode explicitly rather than trusting resp.text's guessed charset;
    # the site serves UTF-8.
    html = resp.content.decode('utf-8')
    return html
# Parse one page.
def decode_pages(html):
    """Extract one record per university from a ranking page.

    Parameters:
        html: decoded HTML text of a ranking page.

    Returns:
        Tuple ``(list_university, title)`` — a list of per-row dicts and
        the list produced by the ``//title/text()`` XPath query.
    """
    e_html = etree.HTML(html)
    # 1. Page title (later used as the output file name).
    title = e_html.xpath('//title/text()')
    # 2. Rank column.
    university_index = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[1]/text()')
    # 3. University name.
    university_name = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[2]/a/div/text()')
    # 4. Region.
    university_location = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[3]/text()')
    # 5. Total score.
    university_score = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[4]/text()')
    # 6. Remaining sub-scores: seven cells per row, flattened across rows.
    university_score_other = e_html.xpath('//div[@class="news-text"]//tbody/tr/td[@class="hidden-xs"]/text()')
    # Assemble one dict per table row.
    list_university = []
    for i in range(len(university_index)):
        # BUG FIX: the original stored the *whole* column lists in every
        # record (only 其他评分 was sliced), so each row dict contained
        # every university's data instead of its own.
        d1 = {
            '排名': university_index[i],
            '大学名称': university_name[i],
            '区域': university_location[i],
            '总分': university_score[i],
            '其他评分': university_score_other[i * 7:i * 7 + 7]
        }
        list_university.append(d1)
    print("解析成功:%s" % (title[0]))
    return list_university, title
# Save as JSON-lines text.
def save_to_txt(list_university, title):
    """Append each university record as one JSON line to ``e:<title>.txt``.

    Parameters:
        list_university: list of per-university dicts from decode_pages().
        title: list whose first element is the page title (used, stripped,
            as the file name).
    """
    # 'a' mode so successive years accumulate in the same file.
    with open(r'e:{}.txt'.format(title[0].strip()), 'a', encoding='utf-8') as df:
        for one in list_university:
            # ensure_ascii=False keeps the Chinese text human-readable.
            # NOTE: requires the module-level `import json` (missing in the
            # original file — this raised NameError at runtime).
            df.write(json.dumps(one, ensure_ascii=False) + '\n')
# Save as CSV.
def save_to_csv(list_university, title):
    """Write the university records to ``e:<title>.csv`` (GBK encoded).

    Each record's seven-element '其他评分' list is expanded into seven
    named score columns before writing.

    Parameters:
        list_university: list of per-university dicts from decode_pages().
        title: list whose first element is the page title (used, stripped,
            as the file name).
    """
    header = ['排名', '大学名称', '区域', '总分',
              '研究生比例', '留学生比例', '师生比',
              '博士学位授权总量', '博士学位授权师均',
              '校友获奖总量', '校友获奖生均']
    # Flatten 其他评分 into the seven trailing columns.
    flattened = []
    for record in list_university:
        sub_scores = record['其他评分']
        row = {
            '排名': record['排名'],
            '大学名称': record['大学名称'],
            '区域': record['区域'],
            '总分': record['总分'],
        }
        for offset, column in enumerate(header[4:]):
            row[column] = sub_scores[offset]
        flattened.append(row)
    # GBK so the file opens cleanly in Chinese-locale Excel.
    with open(r'e:{}.csv'.format(title[0].strip()), 'w', encoding='gbk', newline='') as cf:
        writer = csv.DictWriter(cf, fieldnames=header)
        writer.writeheader()
        writer.writerows(flattened)
if __name__ == '__main__':
    # NOTE: pages from 2015 and earlier use a different HTML structure and
    # would need different parsing rules.
    for year in range(2016, 2020):
        url = "http://www.zuihaodaxue.com/Greater_China_Ranking%d_0.html" % (year)
        # Throttle: pause 2 s between requests to be polite to the server.
        time.sleep(2)
        html = request_pages(url)
        records, page_title = decode_pages(html)
        save_to_txt(records, page_title)
        save_to_csv(records, page_title)
|
|