Skip to content
Snippets Groups Projects
books_spider.py 2.21 KiB
Newer Older
Anton Gusev's avatar
Anton Gusev committed
from scrapy import Spider
import re
from items.BookItem import BookItem
from database.connectors.BookConnector import BookConnector
Anton Gusev's avatar
Anton Gusev committed
from database.connectors.AltBookConnector import AltBookConnector
Anton Gusev's avatar
Anton Gusev committed


class BooksSpider(Spider, BookConnector):
    """Crawl the books.toscrape.com catalogue and yield one ``BookItem`` per book.

    ``parse`` walks the paginated listing and schedules a request for every
    book page it links to; ``parse_book`` extracts the fields of one book.
    """

    name = 'books'
    start_urls = ['http://books.toscrape.com/']
    # Number of the listing page being parsed; only used for debug logging.
    page = 1

    # Rating words searched for in the ``star-rating`` class attribute, paired
    # with their numeric value. Checked in this order; the first match wins.
    _RATING_WORDS = (
        ('One', 1),
        ('Two', 2),
        ('Three', 3),
        ('Four', 4),
        ('Five', 5),
    )

    def parse(self, response):
        """Yield requests for every book on a listing page and for the next page.

        Each book request carries the book's position on the page in
        ``meta['idx']`` and is handled by ``parse_book``. If the page has a
        "next" link, the page counter is incremented and a request for that
        link is yielded with ``parse`` as its callback.
        """
        self.logger.debug('Current page: %s', self.page)

        # follow links to book pages
        book_links = response.css('div.image_container a::attr(href)')
        for idx, href in enumerate(book_links):
            yield response.follow(href, self.parse_book, meta={'idx': idx})

        # pagination
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            self.page += 1
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Build and yield a ``BookItem`` from a single book page.

        The product table cells are read by position, so this raises
        ``IndexError`` if the table has fewer than seven text cells,
        ``AttributeError`` if the sixth cell contains no digits, and
        ``ValueError`` if the seventh cell is not an integer.
        """
        self.logger.debug('Index book in page: %s', response.meta.get('idx'))

        # Text of every cell in the product table; consumed positionally below.
        table_data = response.css('table td::text').getall()

        # Default to '' so a page without a rating element produces a rating
        # of None instead of failing on ``None.split``.
        rating_classes = response.css('p.star-rating::attr(class)').get(default='')

        # NOTE(review): if no image element is found, ``urljoin`` receives
        # None - confirm the resulting URL is acceptable downstream.
        image_src = response.css('div.active img::attr(src)').get()

        book = BookItem()
        book.update({
            'title': response.css('div.product_main h1::text').get(),
            'description': response.xpath("//*[@id='product_description']/following::p/text()").get(),
            'image_urls': [response.urljoin(image_src)],
            'rating': self._get_rating(rating_classes.split(' ')),
            'upc': table_data[0],
            'product_type': table_data[1],
            # The first character of each price cell is treated as the
            # currency symbol and stripped from the stored amount.
            'price_excl_tax': table_data[2][1:],
            'price_incl_tax': table_data[3][1:],
            'tax': table_data[4][1:],
            # First run of digits in the availability cell.
            'in_stock': int(re.search(r'\d+', table_data[5]).group()),
            'count_reviews': int(table_data[6]),
            'currency_type': table_data[2][0],
            'category': response.css('ul.breadcrumb li:nth-child(3) a::text').get(),
            'url': response.url,
        })
        yield book

    def _get_rating(self, class_all):
        """Return the numeric rating (1-5) named in ``class_all``.

        ``class_all`` is the collection of CSS class names taken from the
        rating element. Returns ``None`` when none of the rating words
        'One' through 'Five' is present.
        """
        for word, value in self._RATING_WORDS:
            if word in class_all:
                return value
        return None