# books_spider.py
import re

from scrapy import Spider

from database.connectors.BookConnector import BookConnector
from items.BookItem import BookItem


class BooksSpider(Spider, BookConnector):
    """Crawl books.toscrape.com and yield one ``BookItem`` per book page.

    ``parse`` walks the paginated listing and schedules every book link;
    ``parse_book`` extracts the fields of a single book page.
    ``BookConnector`` is a project mixin defined outside this file.
    """

    name = 'books'
    start_urls = ['http://books.toscrape.com/']
    # Listing-page counter, bumped each time a next-page link is followed;
    # within this class it is only read by the debug log in ``parse``.
    page = 1

    # Rating word carried in the ``star-rating`` class attribute -> number.
    # Kept as ordered pairs so the lookup order matches One..Five.
    _RATING_WORDS = (
        ('One', 1),
        ('Two', 2),
        ('Three', 3),
        ('Four', 4),
        ('Five', 5),
    )
    # Number of ``<td>`` cells ``parse_book`` reads (indices 0..6).
    _TABLE_CELLS = 7
    _DIGITS_RE = re.compile(r'\d+')

    def parse(self, response):
        """Schedule every book on a listing page, then the next listing page.

        Yields one request per book link (handled by ``parse_book``) and,
        when a "next" link exists, one request for the following listing page.
        """
        self.logger.debug('Current page: %s', self.page)

        # follow links to book pages
        # TODO: drop enumerate and the 'idx' meta once parse_book stops
        # logging the index.
        book_links = response.css('div.image_container a::attr(href)')
        for idx, href in enumerate(book_links):
            yield response.follow(href, self.parse_book, meta={'idx': idx})

        # pagination
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            self.page += 1
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Extract a single book page into a ``BookItem``.

        Yields the populated item. If the product table has fewer cells than
        expected, a warning is logged and nothing is yielded instead of
        failing with ``IndexError``.
        """
        self.logger.debug('Index book in page: %s', response.meta.get('idx'))

        table_data = response.css('table td::text').getall()
        if len(table_data) < self._TABLE_CELLS:
            self.logger.warning(
                'Skipping %s: expected %d product table cells, got %d',
                response.url, self._TABLE_CELLS, len(table_data),
            )
            return

        # urljoin(None) would return the page URL itself, so only build an
        # image URL when the <img> was actually found.
        image_src = response.css('div.active img::attr(src)').get()
        image_urls = [response.urljoin(image_src)] if image_src else []

        # A missing rating node gives an empty class list -> rating None.
        rating_classes = response.css(
            'p.star-rating::attr(class)'
        ).get(default='').split(' ')

        # NOTE(review): a cell with no digits is treated as 0 in stock --
        # confirm that is the desired value for such pages.
        stock_match = self._DIGITS_RE.search(table_data[5])
        in_stock = int(stock_match.group()) if stock_match else 0

        book = BookItem()
        book.update({
            'title': response.css('div.product_main h1::text').get(),
            'description': response.xpath(
                "//*[@id='product_description']/following::p/text()"
            ).get(),
            'image_urls': image_urls,
            'rating': self._get_rating(rating_classes),
            'upc': table_data[0],
            'product_type': table_data[1],
            # Price cells: first character is taken as the currency symbol,
            # the remainder as the amount (assumes a one-character symbol).
            'price_excl_tax': table_data[2][1:],
            'price_incl_tax': table_data[3][1:],
            'tax': table_data[4][1:],
            'in_stock': in_stock,
            'count_reviews': int(table_data[6]),
            'currency_type': table_data[2][0],
            'category': response.css(
                'ul.breadcrumb li:nth-child(3) a::text'
            ).get(),
        })
        yield book

    def _get_rating(self, class_all):
        """Map the rating word in ``class_all`` to its number.

        ``class_all`` is the collection of CSS class names of the rating
        element. Returns 1-5 for the first of ``One``..``Five`` it contains,
        or ``None`` when none of them is present.
        """
        for word, value in self._RATING_WORDS:
            if word in class_all:
                return value
        return None