import scrapy


class QuotesSpider(scrapy.Spider):
    # inheriting from the Spider base class
    name = "quotes_spider"  # the spider's name attribute

    def start_requests(self):
        # generator that issues a GET request for each URL we want to scrape
        urls = [
            "http://quotes.toscrape.com/page/1/",
            "http://quotes.toscrape.com/page/2/",
            "http://quotes.toscrape.com/page/3/",
        ]
        for url in urls:
            # request each URL and register parse() as the callback
            # that handles the response when it comes back
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # figure out which page the response came from: split the URL on "/"
        # and take the second-to-last element to get the page id
        page_id = response.url.split("/")[-2]
        # save the response body as an HTML file in the working directory
        filename = "quotes-%s.html" % page_id
        with open(filename, "wb") as f:
            f.write(response.body)
        self.log("saved file %s" % filename)

from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
process.crawl(QuotesSpider)
process.start()
This is throwing a "ReactorNotRestartable" error.
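For context on that error: CrawlerProcess starts Twisted's reactor, and a reactor can only be started once per Python process, so calling process.start() a second time in the same interpreter (for example, re-running a notebook cell or script cell) raises ReactorNotRestartable. Below is a minimal sketch of one common workaround, assuming QuotesSpider is defined in (or importable from) this module: run each crawl in a fresh child process so every run gets its own reactor.

import multiprocessing

from scrapy.crawler import CrawlerProcess


def run_spider():
    # runs in a child process, so the reactor here has never been started
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start()  # blocks until the crawl finishes


if __name__ == "__main__":
    # each call below gets its own process and therefore its own reactor,
    # so the crawl can be repeated without hitting ReactorNotRestartable
    p = multiprocessing.Process(target=run_spider)
    p.start()
    p.join()

Alternatively, running the spider from inside a Scrapy project with the scrapy crawl quotes_spider command starts a fresh process each time, which sidesteps the issue as well.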