I was able to make a csv file and successfully submit, but am unable to save the images using scrapy. I tried to do something like we do using beautiful soup, ie, i made the get request using image url and the save the body of the response and wrote it as binary in a jpg file. But it’s size is 0.
The code looks like this:
import scrapy
import pandas as pd
class BookstoreSpider(scrapy.Spider):
name=“Bookspider”
filename=“Bookstore_Scrapping.csv”
books_dir={
“image_url”: [],
“book_title”: [],
“product_price”: []
}
def start_requests(self):
url=“http://books.toscrape.com/”
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for idx,book in enumerate(response.css(“article.product_pod”)):
book_image=book.css(“div.image_container a img::attr(src)”).get()
book_title=book.css(“h3 a::attr(title)”).get()
book_cost=book.css(“div.product_price p.price_color::text”).get()
image=scrapy.Request(response.urljoin(book_image))
with open (“Scraped images/BookIndex:”+str(idx)+".jpg", “wb”) as imgfile:
imgfile.write(image.body)
self.log(“Saved%s”%book_title)
self.books_dir[“image_url”].append(book_image)
self.books_dir[“book_title”].append(book_title)
self.books_dir[“product_price”].append(book_cost)
df=pd.DataFrame(self.books_dir)
df.to_csv(self.filename, index=False)
self.log(“Done%s”%response.url)
next_page=response.css(“li.next a::attr(href)”).get()
if next_page is not None:
next_page=response.urljoin(next_page)
yield scrapy.Request(url=next_page, callback=self.parse)