import logging
import re

import scrapy

from database.connectors.BookConnector import BookConnector
from items.BookItem import BookItem

# Words that appear in the star-rating element's class attribute, mapped to
# the numeric rating they stand for. Insertion order is the lookup priority.
_RATING_BY_WORD = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5,
}


class BooksSpider(BookConnector, scrapy.Spider):
    """Spider that walks the catalogue listing pages and scrapes each book page.

    ``parse`` handles listing pages (book links + pagination) and
    ``parse_book`` turns a single book page into a ``BookItem``.
    """

    name = 'books'
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Yield a request per book on a listing page, then follow pagination.

        Args:
            response: The listing-page response.

        Yields:
            Requests for every book link found in ``div.image_container``
            (handled by ``parse_book``) and, when a ``li.next`` link exists,
            a request for the next listing page (handled by ``parse``).
        """
        # Follow links to book pages.
        for href in response.css('div.image_container a::attr(href)'):
            yield response.follow(href, self.parse_book)

        # Pagination: the last page has no "next" link, which ends the crawl.
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Extract one book's details from its product page.

        The text of the page's ``table td`` cells is read positionally:
        index 0 is used as the UPC, 1 the product type, 2 the price excluding
        tax, 3 the price including tax, 4 the tax, 5 the availability text and
        6 the review count. For the three money cells the first character is
        dropped from the value; the first character of cell 2 is stored
        separately as ``currency_type``.

        Args:
            response: The book-page response.

        Yields:
            A populated ``BookItem``.

        Raises:
            IndexError: If the table has fewer cells than expected.
            AttributeError: If the availability cell contains no digits.
            ValueError: If the review-count cell is not an integer.
        """
        table_data = response.css('table td::text').getall()

        # Default to an empty string so a page without a star-rating element
        # produces a rating of None instead of failing on ``None.split``.
        rating_classes = response.css(
            'p.star-rating::attr(class)'
        ).get(default='').split(' ')

        image_src = response.css('div.active img::attr(src)').get()
        description = response.xpath(
            "//*[@id='product_description']/following::p/text()"
        ).get()

        book = BookItem()
        book.update({
            'title': response.css('div.product_main h1::text').get(),
            'description': description,
            'image_urls': [response.urljoin(image_src)],
            'rating': self._get_rating(rating_classes),
            'upc': table_data[0],
            'product_type': table_data[1],
            'price_excl_tax': table_data[2][1:],
            'price_incl_tax': table_data[3][1:],
            'tax': table_data[4][1:],
            # First run of digits in the availability text.
            'in_stock': int(re.search(r'\d+', table_data[5]).group()),
            'count_reviews': int(table_data[6]),
            'currency_type': table_data[2][0],
            'category': response.css(
                'ul.breadcrumb li:nth-child(3) a::text'
            ).get(),
        })
        yield book

    def _get_rating(self, class_all):
        """Translate the star-rating class names into a number.

        Args:
            class_all: Container of class names (``parse_book`` passes the
                class attribute split on spaces).

        Returns:
            The integer 1-5 for the first of 'One'..'Five' found in
            ``class_all``, or ``None`` when none of them is present.
        """
        for word, rating in _RATING_BY_WORD.items():
            if word in class_all:
                return rating
        return None