import logging
import re

import scrapy

from database.connectors.BookConnector import BookConnector
from items.BookItem import BookItem

# Words that may appear in the star-rating CSS class, mapped to their numeric
# value.  Insertion order is the order in which the words are tested.
_RATING_BY_WORD = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5,
}


class BooksSpider(BookConnector, scrapy.Spider):
    """Crawl the books.toscrape.com catalogue and yield one BookItem per book.

    Starts at the first catalogue page, follows every book link found on a
    listing page to ``parse_book`` and follows the "next" pagination link
    back into ``parse``.
    """

    name = 'books'
    start_urls = [
        'http://books.toscrape.com/catalogue/page-1.html'
    ]

    # NOTE(review): Scrapy reads per-spider settings from ``custom_settings``;
    # a bare ``ITEM_PIPELINES`` class attribute is presumably ignored by the
    # framework.  Left unchanged here -- confirm whether the pipeline is
    # meant to be enabled before moving it.
    ITEM_PIPELINES = {
        'tutorial.pipelines.PricePipeline': 1,
    }

    def parse(self, response):
        """Handle a catalogue listing page.

        Yields a request for every book link on the page (handled by
        ``parse_book``) and, if a "next" link exists, a request for the
        following listing page (handled by this method again).
        """
        # follow links to book pages
        for href in response.css('div.image_container a::attr(href)'):
            # TODO
            yield response.follow(href, self.parse_book)

        # pagination
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Handle a single book page and yield the populated ``BookItem``.

        The cells of the page's table are read positionally: indices 0-6 are
        stored as upc, product type, price excl. tax, price incl. tax, tax,
        stock text (first integer is extracted) and review count.
        """
        book = BookItem()
        table_data = response.css('table td::text').getall()

        # ``.get()`` returns None when the node is absent; guard it so a
        # missing rating yields ``rating=None`` (the same value _get_rating
        # uses for an unrecognised rating) instead of an AttributeError.
        rating_classes = response.css('p.star-rating::attr(class)').get()
        if rating_classes is None:
            rating = None
        else:
            rating = self._get_rating(rating_classes.split(' '))

        book.update({
            'url': response.url,
            'title': response.css('div.product_main h1::text').get(),
            'description': response.xpath("//*[@id='product_description']/following::p/text()").get(),
            'image': response.urljoin(response.css('div.active img::attr(src)').get()),
            'rating': rating,
            'upc': table_data[0],
            'product_type': table_data[1],
            'price_excl_tax': table_data[2],
            'price_incl_tax': table_data[3],
            'tax': table_data[4],
            # First run of digits in the stock cell.
            'in_stock': int(re.search(r'\d+', table_data[5]).group()),
            'count_reviews': int(table_data[6]),
        })

        yield book

    def _get_rating(self, class_all):
        """Translate the star-rating CSS classes into a number.

        ``class_all`` is tested with ``in`` against the words 'One' through
        'Five', in that order; the value of the first word found is
        returned.  Returns None when none of the words is present.
        """
        for word, value in _RATING_BY_WORD.items():
            if word in class_all:
                return value
        return None