import re
import requests
import csv
import os
from lxml import etree
from urllib.parse import urljoin


class TengXun():
    """Crawler for a Tencent-style news feed API.

    Walks several channel feeds page by page, fetches every linked article,
    extracts title / body text / publish time, and appends the records to a
    local text file ('新闻.txt').
    """

    def __init__(self):
        # NOTE(review): this template URL is never used — data_get() builds
        # its own URLs. Kept only for backward compatibility with callers
        # that may read the attribute. (Its literal {…} braces would also
        # break str.format, so it could not have worked as a template.)
        self.url = 'https://i.news..com/trpc.news_web.kv_srv.kv_srv_http_proxy/list?sub_srv_id=24hours&srv_id=pc&offset={0}&limit=20&strategy=1&ext={%22pool%22:[%22top%22],%22is_filter%22:7,%22check_type%22:true}'
        # Browser-like headers so the feed/article endpoints accept the request.
        self.headers = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "origin": "https://news..com",
            "referer": "https://news..com/",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
        }

    def data_get(self):
        """Iterate channels and paginated offsets; for every feed entry,
        dispatch its (title, url) pair to self.detail().

        Pagination stops for a channel as soon as the API returns an empty
        (or missing) item list.
        """
        srv_id = ['24hours', 'ent', 'milite', 'world', 'tech', 'finance']
        for srv in srv_id:
            ur = 'https://i.news..com/trpc.news_web.kv_srv.kv_srv_http_proxy/list?sub_srv_id=' + srv
            for num in range(20, 200, 20):
                print(srv, num, '*' * 50)
                url = ur + '&srv_id=pc&offset=' + str(num) + '&limit=20&strategy=1&ext={%22pool%22:[%22top%22],%22is_filter%22:7,%22check_type%22:true}'
                # FIX: timeout so one hung request cannot stall the whole crawl.
                resp = requests.get(url, headers=self.headers, timeout=10).json()
                # FIX: guard against a malformed/denied response instead of
                # raising KeyError on resp['data']['list'].
                d_list = (resp.get('data') or {}).get('list')
                if not d_list:
                    break  # channel exhausted — move on to the next one
                for d in d_list:
                    title = d['title']
                    href = d['url']
                    print(title, href)
                    self.detail(title, href)

    def detail(self, title, url):
        """Fetch one article page, pull the publish time and body text,
        then hand everything to self.save().

        Publish time falls back to '无' when the page carries no
        "pubtime" field.
        """
        resp = requests.get(url, headers=self.headers, timeout=10)
        # FIX: set the corrected encoding *before* reading resp.text —
        # the original assigned apparent_encoding only after the regex had
        # already run over possibly mis-decoded text.
        resp.encoding = resp.apparent_encoding
        result = re.findall(r'"pubtime":(.*?),', resp.text, re.S)
        if not result:
            result = '无'
        else:
            result = result[0].replace('"', '')
        tree = etree.HTML(resp.text)
        # Article body: concatenate all text under the content paragraphs,
        # then strip spaces and newlines.
        p_text = ''.join(tree.xpath('//p[@class="one-p"]//text()')).replace(' ', '').replace('\n', '')
        source = '腾讯'
        self.save(url, title, p_text, result, source)

    def save(self, url, title, p_text, result, source):
        """Append one record (url, title, body, pubtime, source) to
        新闻.txt, one field per line.

        FIX: open the file once instead of five separate open/close cycles.
        """
        with open('新闻.txt', 'a', encoding='utf-8') as f:
            f.write(url + '\n')
            f.write(title + '\n')
            f.write(p_text + '\n')
            f.write(result + '\n')
            f.write(source + '\n')


if __name__ == "__main__":
    t = TengXun()
    t.data_get()
讯享网

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容,请联系我们,一经查实,本站将立刻删除。
如需转载请保留出处:https://51itzy.com/kjqy/126507.html