1、爬虫主文件kuqin.py
# -*- coding: utf-8 -*-
"""Spider for www.kuqin.com: crawls article list pages and article detail pages."""
import re
from urllib import parse

import scrapy
from scrapy.http import Request

from ArticleSpider.items import ArticelItem


class KuqinSpider(scrapy.Spider):
    """Crawl kuqin.com article listings, follow pagination, and parse details."""

    name = 'kuqin'
    allowed_domains = ['www.kuqin.com']
    start_urls = ['http://www.kuqin.com']

    def parse(self, response):
        """Extract article detail URLs from a list page and follow pagination.

        Yields one Request per article detail page (handled by parse_detail),
        then a Request for the next list page when one exists.
        """
        post_urls = response.css(".list-boxes h2 a::attr(href)").extract()
        for post_url in post_urls:
            # Detail links may be relative; resolve against the current page URL.
            post_url = parse.urljoin(response.url, post_url)
            yield Request(url=post_url, callback=self.parse_detail)

        # BUG FIX: the original used extract()[0], which raises IndexError when
        # the pagination element is absent (e.g. on the last page).
        # extract_first() returns None instead, so the `if` guard works.
        next_url = response.xpath(
            "//div[@class='pagination']/ul/li[10]/a/@href").extract_first()
        if next_url:
            next_url = parse.urljoin(response.url, next_url)
            yield Request(url=next_url, callback=self.parse)

    def parse_detail(self, response):
        """Parse a single article page into an ArticelItem.

        Raises IndexError if the expected elements are missing — the XPaths
        assume the standard kuqin.com article template.
        """
        article_item = ArticelItem()
        title = response.xpath(
            "//div[@class='tc-box first-box article-box']/h2/text()").extract()[0]
        create_date = response.xpath(
            "//div[@class='article-infobox']/span/text()").extract()[0]
        # Keep only the text up to the last whitespace run (raw string for the
        # regex — the original "(.*)\s" relied on an invalid escape sequence).
        create_date = re.match(r"(.*)\s", create_date).group(1).strip()
        author = response.xpath(
            "//div[@class='kq__article-power']/p/text()").extract()[1]
        content = response.xpath("//div[@id='article_content']").extract()[0]

        article_item["title"] = title
        article_item["create_date"] = create_date
        article_item["author"] = author
        article_item["content"] = content
        yield article_item
2、items.py
class ArticelItem(scrapy.Item):
    """Container for one scraped kuqin.com article.

    NOTE(review): the class name keeps the original 'ArticelItem' (sic)
    spelling because the spider and pipelines import it under that name.
    """
    # BUG FIX: removed the "讯享网" watermark text that was fused onto the
    # class statement and made this snippet a syntax error.
    title = scrapy.Field()
    create_date = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
3、pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.exporters import JsonItemExporter


class ArticlespiderPipeline(object):
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        return item


# Custom JSON-lines export (hand-rolled alternative to JsonItemExporter).
class JsonWithEncodingPipeline(object):
    """Append each item as one JSON line to acticle.json."""

    def __init__(self):
        self.file = codecs.open("acticle.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII (Chinese) text human-readable.
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        self.file.close()


# JSON export via Scrapy's built-in exporter.
class JsonExporterPipleline(object):
    """Export all items as a JSON array to articlexport.json."""

    def __init__(self):
        self.file = open("articlexport.json", "wb")
        self.exporter = JsonItemExporter(self.file, encoding="utf-8",
                                         ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


# Synchronous storage: one blocking INSERT + commit per item.
class MysqlPipeline(object):
    """Store items in MySQL synchronously (blocks the crawler per item)."""

    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root',
                                    'article_spider', charset='utf8',
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: values are escaped by the driver, never
        # string-interpolated into the SQL.
        insert_sql = """
            insert into article(title,create_date,author,content)
            values(%s,%s,%s,%s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["create_date"],
                                         item["author"], item["content"]))
        self.conn.commit()
        # BUG FIX: the original returned nothing, which would hand None to any
        # later pipeline. Scrapy pipelines must return the item (or a Deferred).
        return item


# Asynchronous storage using Twisted's adbapi connection pool.
class MysqlTwistedPipline(object):
    """Store items in MySQL without blocking, via a Twisted connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* keys in settings.py."""
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # BUG FIX: the original passed the dict positionally
        # (ConnectionPool("MySQLdb", dbarms)), so the connection parameters
        # were never applied as keyword args. They must be unpacked with **.
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        # BUG FIX: return the item so later pipelines still receive it.
        return item

    def handle_error(self, failure, item, spider):
        # Surface DB errors; without this errback they would pass silently.
        print(failure)

    def do_insert(self, cursor, item):
        """Run inside a pool thread; adbapi commits the transaction for us."""
        insert_sql = """
            insert into article(title,create_date,author,content)
            values(%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (item["title"], item["create_date"],
                                    item["author"], item["content"]))
4、settings.py
# BUG FIX: removed the "讯享网" watermark text fused onto the first statement,
# which made this snippet a syntax error.

# Ignore robots.txt so the crawler is not blocked by the site's policy.
ROBOTSTXT_OBEY = False

# Enabled item pipelines (lower number = runs earlier in the chain).
# Uncomment exactly the storage backends you want active.
ITEM_PIPELINES = {
    # 'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    # 'ArticleSpider.pipelines.MysqlPipeline': 290,
    # 'ArticleSpider.pipelines.MysqlTwistedPipline': 200,
    # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 200,
    'ArticleSpider.pipelines.JsonExporterPipleline': 200,
}

# MySQL connection parameters consumed by MysqlTwistedPipline.from_settings.
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = "article_spider"
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
项目链接

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容,请联系我们,一经查实,本站将立刻删除。
如需转载请保留出处:https://51itzy.com/kjqy/37969.html