爬虫—dy直播各个类别下直播数据

爬虫—dy直播各个类别下直播数据(附代码)。直播首页数据:import requests; import flask; from bs4 import BeautifulSoup; from lxml import etree; def getInfoUtils(info): if info: return info[0] else: ……

大家好,我是讯享网,很高兴认识大家。

附代码

直播首页数据

"""Scrape the live-stream listing shown on a Douyu directory page.

Public entry point: ``liveMain(url)``, which returns one dict per stream
found on the page.
"""

import logging

import requests
import flask  # noqa: F401  (not referenced in this listing; kept as in the original)
from bs4 import BeautifulSoup  # noqa: F401  (not referenced in this listing; kept as in the original)
from lxml import etree

logger = logging.getLogger(__name__)

BASE_URL = "https://www.douyu.com"
REQUEST_TIMEOUT = 10  # seconds; prevents a stalled connection from hanging the scrape

# XPath fragments are relative (".//") so they can be evaluated directly on a
# sub-element instead of serialising the element and parsing it again.
LIST_ITEM_XPATH = (
    '//main[@class="layout-Main"]'
    '//div[@class="layout-Module-container layout-Cover ListContent"]/ul/li'
)
LINK_XPATH = './/div[@class="DyListCover HeaderCell is-href"]/a[1]'
INFO_XPATH = './/div[@class ="DyListCover-content"]/div[@class="DyListCover-info"]'


def getInfoUtils(info):
    """Return the first item of *info*, or ``''`` when *info* is empty.

    The original ``else: ''`` branch was a bare expression, so the function
    returned ``None`` for empty input; it now returns the empty string.
    """
    return info[0] if info else ''


def getHtml(url):
    """Fetch *url* and return the response body as text.

    Returns ``None`` (after logging a warning) when the request raises a
    ``requests`` exception or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning("request to %s failed: %s", url, exc)
        return None
    if response.status_code != 200:
        logger.warning("request to %s returned HTTP %s", url, response.status_code)
        return None
    return response.text


def htmlToTree(html):
    """Parse an HTML string with ``lxml.etree.HTML`` and return the result."""
    return etree.HTML(html)


def _first_text(node, path):
    """Return the first result of *path* on *node*, or ``''`` if unavailable."""
    if node is None:
        return ''
    return getInfoUtils(node.xpath(path))


def parseLi(li):
    """Extract one stream entry from a list item.

    *li* may be an HTML string (as in the original code) or an already parsed
    lxml element.

    Returns a dict with the keys ``live_link``, ``live_type``, ``live_title``,
    ``live_hot`` and ``live_name``. Text fields that cannot be found are
    ``''``. Returns ``None`` when the item has no stream link at all, so a
    single malformed item no longer raises ``IndexError``.
    """
    li_tree = htmlToTree(li) if isinstance(li, (str, bytes)) else li
    if li_tree is None:
        return None

    hrefs = li_tree.xpath(LINK_XPATH + '/@href')
    anchors = li_tree.xpath(LINK_XPATH)
    if not hrefs or not anchors:
        return None

    # The code reads type/title from the first "DyListCover-info" block and
    # hot/name from the second one.
    info_blocks = anchors[0].xpath(INFO_XPATH)
    first_block = info_blocks[0] if info_blocks else None
    second_block = info_blocks[1] if len(info_blocks) > 1 else None

    return {
        'live_link': BASE_URL + hrefs[0],
        'live_type': _first_text(first_block, './/span/text()'),
        'live_title': _first_text(first_block, './/h3/text()'),
        'live_hot': _first_text(second_block, './/span/text()'),
        'live_name': _first_text(second_block, './/h2/div/text()'),
    }


def parseHtml(html):
    """Return the list of stream dicts found in the page source *html*.

    Returns an empty list when *html* is empty/``None`` or cannot be parsed.
    Items for which ``parseLi`` returns ``None`` are skipped.
    """
    if not html:
        return []
    tree = etree.HTML(html)
    if tree is None:
        return []
    entries = (parseLi(item) for item in tree.xpath(LIST_ITEM_XPATH))
    return [entry for entry in entries if entry is not None]


def liveMain(url):
    """Download the page at *url* and return its parsed stream list."""
    return parseHtml(getHtml(url))


if __name__ == '__main__':
    url = 'https://www.douyu.com/directory/all'
    data = liveMain(url)
    print(data)
讯享网

分类下的直播数据

"""Scrape Douyu's category directory, then the live listing of each category.

``categoryMain(url)`` returns the category groups found on the directory
page; ``main(data)`` walks those groups and prints the streams of every
category using ``liveMain`` from the ``douyuLive`` module.
"""

import logging

import requests
from lxml import etree

from douyuLive import liveMain

logger = logging.getLogger(__name__)

BASE_URL = "https://www.douyu.com"
REQUEST_TIMEOUT = 10  # seconds; prevents a stalled connection from hanging the scrape

# The original loop started at index 2, i.e. it ignored the first two <div>
# children of the section. NOTE(review): presumably those are non-category
# blocks -- confirm against the live page markup.
LEADING_BLOCKS_TO_SKIP = 2

# "descendant-or-self::" keeps the queries relative to the node they are run
# on while still matching that node itself, which mirrors the original
# behaviour of serialising the node and querying the re-parsed document.
TITLE_XPATH = 'descendant-or-self::div[@class="categoryBox-head"]/h4/text()'
ITEM_LIST_XPATH = 'descendant-or-self::ul[@class="layout-Classify-list"]/li/a'


def getHtml(url):
    """Fetch *url* and return the response body as text.

    Returns ``None`` (after logging a warning) when the request raises a
    ``requests`` exception or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        logger.warning("request to %s failed: %s", url, exc)
        return None
    if response.status_code != 200:
        logger.warning("request to %s returned HTTP %s", url, response.status_code)
        return None
    return response.text


def htmlToTree(html):
    """Parse an HTML string with ``lxml.etree.HTML`` and return the result."""
    return etree.HTML(html)


def parseCate(category):
    """Extract one category group.

    *category* may be an HTML string (as in the original code) or an already
    parsed lxml element.

    Returns ``{"category_title": str, "category_list": [...]}`` where each
    list entry has the keys ``category_item``, ``category_href`` and
    ``category_tatolhot`` (key spelling kept as in the original so existing
    consumers keep working). As before, the list is left empty when the
    name/href/hot columns differ in length; a missing title now yields ``''``
    instead of raising ``IndexError``.
    """
    node = htmlToTree(category) if isinstance(category, (str, bytes)) else category
    if node is None:
        return {"category_title": '', "category_list": []}

    titles = node.xpath(TITLE_XPATH)
    hrefs = node.xpath(ITEM_LIST_XPATH + '/@href')
    names = node.xpath(ITEM_LIST_XPATH + '/strong/text()')
    hots = node.xpath(ITEM_LIST_XPATH + '/div/span/text()')

    category_list = []
    # Only pair the columns up when they line up one-to-one; otherwise a
    # missing value would shift every later entry onto the wrong category.
    if len(names) == len(hots) == len(hrefs):
        category_list = [
            {
                'category_item': name,
                'category_href': BASE_URL + href,
                'category_tatolhot': hot,
            }
            for name, href, hot in zip(names, hrefs, hots)
        ]

    return {
        "category_title": titles[0] if titles else '',
        "category_list": category_list,
    }


def parseHtml(html):
    """Return the list of category groups found in the directory page *html*.

    Returns an empty list when *html* is empty/``None`` or cannot be parsed.
    """
    if not html:
        return []
    tree = htmlToTree(html)
    if tree is None:
        return []
    all_categories = tree.xpath(
        '//main[@id="allCate"]/section[@class="layout-Module"]/div')
    return [parseCate(block) for block in all_categories[LEADING_BLOCKS_TO_SKIP:]]


def categoryMain(url):
    """Download the directory page at *url* and return its category groups."""
    html = getHtml(url=url)
    return parseHtml(html)


def main(data):
    """Fetch and print the stream list of every category in *data*.

    *data* is the structure returned by ``categoryMain``.
    """
    for group in data:
        for entry in group["category_list"]:
            info = liveMain(entry["category_href"])
            print("*", entry["category_item"])
            print(info)


if __name__ == '__main__':
    url = "https://www.douyu.com/directory"
    data = categoryMain(url)
    print(data)
    main(data)
小讯
上一篇 2025-03-13 19:27
下一篇 2025-01-13 17:00

相关推荐

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容,请联系我们,一经查实,本站将立刻删除。
如需转载请保留出处:https://51itzy.com/kjqy/47714.html