Skip to content
Snippets Groups Projects
books_spider.py 1.95 KiB
Newer Older
Anton Gusev's avatar
Anton Gusev committed
import scrapy
import re
Anton Gusev's avatar
Anton Gusev committed
from items.BookItem import BookItem
from database.connectors.BookConnector import BookConnector
import logging
Anton Gusev's avatar
Anton Gusev committed

class BooksSpider(BookConnector, scrapy.Spider):
    """Crawl books.toscrape.com and yield one ``BookItem`` per book page.

    Listing pages are walked through their "next" pagination link; every
    book link found on a listing page is followed to :meth:`parse_book`.
    """

    name = 'books'
    start_urls = ['http://books.toscrape.com/']

    # Words used in the ``star-rating`` CSS class, in the order they are
    # checked, mapped to their numeric rating.
    _RATING_BY_WORD = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

    def parse(self, response):
        """Follow every book link on a listing page, then the next page.

        Args:
            response: Response for a listing page.

        Yields:
            A request per book link (handled by :meth:`parse_book`) and, when
            a "next" link is present, a request for the next listing page
            (handled by this method again).
        """
        # follow links to book pages
        for href in response.css('div.image_container a::attr(href)'):
            yield response.follow(href, self.parse_book)

        # pagination
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Extract a single book from its product page.

        The product table cells are read by position: UPC, product type,
        price excluding tax, price including tax, tax, availability text and
        review count. The first character of each price cell is treated as
        the currency symbol and stripped from the stored amounts.

        Args:
            response: Response for a book page.

        Yields:
            One populated ``BookItem``.

        Raises:
            IndexError: If the product table has fewer than seven cells.
            ValueError: If the review count cell is not an integer.
        """
        book = BookItem()

        table_data = response.css('table td::text').getall()

        # ``.get()`` returns None when nothing matches. Joining None would
        # silently produce the page URL itself, so emit no image URL instead.
        image_src = response.css('div.active img::attr(src)').get()
        image_urls = [response.urljoin(image_src)] if image_src else []

        # A missing rating element must not abort the whole item.
        rating_classes = (
            response.css('p.star-rating::attr(class)').get() or ''
        ).split()

        # The availability text carries the stock count as its only number;
        # when no number is advertised, treat the stock as 0.
        stock_match = re.search(r'\d+', table_data[5])
        in_stock = int(stock_match.group()) if stock_match else 0

        book.update({
            'title': response.css('div.product_main h1::text').get(),
            'description': response.xpath(
                "//*[@id='product_description']/following::p/text()"
            ).get(),
            'image_urls': image_urls,
            'rating': self._get_rating(rating_classes),
            'upc': table_data[0],
            'product_type': table_data[1],
            'price_excl_tax': table_data[2][1:],
            'price_incl_tax': table_data[3][1:],
            'tax': table_data[4][1:],
            'in_stock': in_stock,
            'count_reviews': int(table_data[6]),
            'currency_type': table_data[2][0],
            'category': response.css(
                'ul.breadcrumb li:nth-child(3) a::text'
            ).get(),
        })
        yield book

    def _get_rating(self, class_all):
        """Translate the rating word among CSS classes into a number.

        Args:
            class_all: Collection of CSS class names (or a class string) taken
                from the ``star-rating`` element.

        Returns:
            The rating from 1 to 5 for the first of 'One' … 'Five' found in
            ``class_all``, or None when none is present or ``class_all`` is
            empty.
        """
        if not class_all:
            return None
        for word, rating in self._RATING_BY_WORD.items():
            if word in class_all:
                return rating
        return None