Skip to content
Snippets Groups Projects
books_spider.py 1.82 KiB
Newer Older
Anton Gusev's avatar
Anton Gusev committed
import scrapy
import re
Anton Gusev's avatar
Anton Gusev committed
from items.BookItem import BookItem

Anton Gusev's avatar
Anton Gusev committed

class BooksSpider(scrapy.Spider):
    """Crawl the books.toscrape.com catalogue and yield one ``BookItem`` per book.

    The crawl starts at the first catalogue page, follows every book link
    found on it, and then follows the "next" pagination link until there is
    none left.
    """

    name = 'books'
    start_urls = [
        'http://books.toscrape.com/catalogue/page-1.html'
    ]
    # NOTE(review): Scrapy takes per-spider setting overrides from the
    # ``custom_settings`` class attribute; a bare ``ITEM_PIPELINES`` attribute
    # is presumably never consulted, so this pipeline is only active if the
    # project settings enable it. Left unchanged on purpose -- confirm the
    # intent (and that the dotted path resolves) before moving it.
    ITEM_PIPELINES = {
        'tutorial.pipelines.PricePipeline': 1,
    }

    # Word used in the ``star-rating`` CSS class -> numeric rating.
    # Insertion order matters: it is the order in which words are checked.
    _RATINGS = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5,
    }

    def parse(self, response):
        """Yield a request per book on a catalogue page, then the next page.

        Args:
            response: The downloaded catalogue listing page.

        Yields:
            Requests for each book page (handled by ``parse_book``) and, when
            a "next" link exists, a request for the following listing page
            (handled by this method again).
        """
        # follow links to book pages
        for href in response.css('div.image_container a::attr(href)'):  # TODO
            yield response.follow(href, self.parse_book)

        # pagination
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Extract a single book's details from its product page.

        Table cells are read by position from the text of every ``table td``
        on the page. Any value that cannot be found or parsed is stored as
        ``None`` instead of raising, so one malformed page does not abort the
        callback.

        Args:
            response: The downloaded book detail page.

        Yields:
            A populated ``BookItem``.
        """
        table_data = response.css('table td::text').getall()

        def cell(index):
            # Positional access that tolerates a shorter-than-expected table.
            return table_data[index] if index < len(table_data) else None

        # ``.get()`` returns None when nothing matches; urljoin(None) would
        # silently yield the page URL, so only join a real ``src`` value.
        image_src = response.css('div.active img::attr(src)').get()
        image_url = response.urljoin(image_src) if image_src else None

        # Fall back to an empty string so a missing rating element gives an
        # empty class list (and therefore a ``None`` rating), not a crash.
        rating_attr = response.css('p.star-rating::attr(class)').get() or ''

        book = BookItem()
        book.update({
            'url': response.url,
            'title': response.css('div.product_main h1::text').get(),
            'description': response.xpath("//*[@id='product_description']/following::p/text()").get(),
            'image': image_url,
            'rating': self._get_rating(rating_attr.split(' ')),
            'upc': cell(0),
            'product_type': cell(1),
            'price_excl_tax': cell(2),
            'price_incl_tax': cell(3),
            'tax': cell(4),
            'in_stock': self._first_int(cell(5)),
            'count_reviews': self._to_int(cell(6)),
        })
        yield book

    def _get_rating(self, class_all):
        """Translate the star-rating CSS classes into a number.

        Args:
            class_all: Container of class names (e.g. the ``class`` attribute
                split on spaces) that may include one of ``One`` .. ``Five``.

        Returns:
            The rating as an int from 1 to 5, or ``None`` when no known
            rating word is present.
        """
        for word, value in self._RATINGS.items():
            if word in class_all:
                return value
        return None

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or ``None``."""
        if not text:
            return None
        match = re.search(r'\d+', text)
        return int(match.group()) if match else None

    @staticmethod
    def _to_int(text):
        """Return ``int(text)``, or ``None`` when *text* is missing or not numeric."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return None