Installation
pip install pypiwin32
pip install scrapy
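Note that pypiwin32 is only needed on Windows; on Linux or macOS installing scrapy alone is enough. A quick sanity check that the installation succeeded (standard Scrapy CLI commands, output depends on your environment):

scrapy version      # prints the installed Scrapy version
scrapy version -v   # also prints lxml, Twisted and Python versions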
Creating and running the project
scrapy startproject qsbk                        # create the project
scrapy genspider qsbk_spider "qiushibaike.com"  # create the spider
scrapy crawl qsbk_spider                        # run the spider
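After these three commands the project layout looks roughly like the tree below (the exact files depend on your Scrapy version; genspider puts qsbk_spider.py under the spiders/ package):

qsbk/
    scrapy.cfg
    qsbk/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            qsbk_spider.py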
Code
qsbk_spider.py
# -*- coding: utf-8 -*-
import scrapy

from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/8hr/page/1/']
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        duanzidivs = response.xpath("//div[@id='content-left']/div")
        for duanzidiv in duanzidivs:
            author = duanzidiv.xpath(".//h2/text()").get().strip()
            content = duanzidiv.xpath(".//div[@class='content']//text()").getall()
            content = "".join(content).strip()
            item = QsbkItem(author=author, content=content)
            yield item
        # Crawl the following pages
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:
            return
        else:
            yield scrapy.Request(self.base_domain + next_url, callback=self.parse)
items.py
import scrapy


class QsbkItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
import json

# 1. Manually convert the item dict to JSON
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'w', encoding='utf-8')
#
#     def open_spider(self, spider):
#         print('Spider started')
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self, spider):
#         self.fp.close()
#         print('Spider finished')

# 2. Use JsonItemExporter, suitable when the amount of data is small
# from scrapy.exporters import JsonItemExporter
# class QsbkPipeline(object):
#     def __init__(self):
#         self.fp = open('duanzi.json', 'wb')
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
#         self.exporter.start_exporting()
#
#     def open_spider(self, spider):
#         print('Spider started')
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()
#         self.fp.close()
#         print('Spider finished')

# 3. Use JsonLinesItemExporter, suitable when the amount of data is large
from scrapy.exporters import JsonLinesItemExporter


class QsbkPipeline(object):
    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished')
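Because JsonLinesItemExporter writes one JSON object per line rather than a single JSON array, the resulting duanzi.json has to be parsed line by line. A minimal sketch for reading it back, assuming the spider has already been run and duanzi.json sits in the current directory:

import json

# Each line of duanzi.json is an independent JSON object.
with open('duanzi.json', encoding='utf-8') as fp:
    duanzi = [json.loads(line) for line in fp if line.strip()]

print(len(duanzi))          # number of scraped items
print(duanzi[0]['author'])  # author of the first item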
settings.py
ROBOTSTXT_OBEY = False   # don't obey robots.txt
DOWNLOAD_DELAY = 1       # wait 1 second between requests
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}
ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,   # enable the JSON export pipeline
}
start.py
from scrapy import cmdline

# Equivalent to running "scrapy crawl qsbk_spider" on the command line
cmdline.execute("scrapy crawl qsbk_spider".split())
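With this small script the spider can be started by running python start.py from the project root (or from an IDE's run button) instead of typing the scrapy crawl command in a terminal each time.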