# books_spider.py

import scrapy
import re
from items.BookItem import BookItem


class BooksSpider(scrapy.Spider):
    """Crawl the books.toscrape.com catalogue and yield one ``BookItem`` per book.

    ``parse`` walks the paginated listing, following every book thumbnail link
    and the "next" link; ``parse_book`` turns a book detail page into an item.
    """

    name = 'books'
    start_urls = [
        'http://books.toscrape.com/catalogue/page-1.html'
    ]
    ITEM_PIPELINES = {
        'tutorial.pipelines.PricePipeline': 1,
    }
    # Scrapy reads per-spider settings only from ``custom_settings``; a bare
    # ``ITEM_PIPELINES`` class attribute is ignored. The original attribute is
    # kept for backward compatibility and re-exported here so it takes effect.
    custom_settings = {
        'ITEM_PIPELINES': ITEM_PIPELINES,
    }

    # First run of digits in the availability cell.
    _STOCK_COUNT_RE = re.compile(r'\d+')

    # Rating words looked up in the star-rating class list, in priority order.
    _RATING_WORDS = (
        ('One', 1),
        ('Two', 2),
        ('Three', 3),
        ('Four', 4),
        ('Five', 5),
    )

    def parse(self, response):
        """Yield a request for every book on a listing page, then for the next page.

        Args:
            response: The downloaded catalogue listing page.

        Yields:
            Requests handled by ``parse_book`` for each book link, followed by
            a request handled by ``parse`` for the next listing page, if any.
        """
        # Follow links to book pages; ``response.follow`` resolves relative hrefs.
        for href in response.css('div.image_container a::attr(href)'):
            yield response.follow(href, self.parse_book)

        # Pagination: the last page has no ``li.next`` element.
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Extract a single ``BookItem`` from a book detail page.

        Args:
            response: The downloaded book detail page.

        Yields:
            One ``BookItem`` populated from the page.

        Raises:
            IndexError: If the product table has fewer than seven text cells.
            ValueError: If the review count cell is not an integer.
        """
        book = BookItem()

        # The product table is read positionally: UPC, product type, price
        # excl. tax, price incl. tax, tax, availability, review count.
        table_data = response.css('table td::text').getall()

        # A page without a star-rating element yields an empty class list
        # (rating ``None``) instead of crashing on ``None.split``.
        rating_classes = response.css('p.star-rating::attr(class)').get(default='').split()

        # Availability text without any number is treated as zero copies in
        # stock rather than aborting the whole item.
        stock_match = self._STOCK_COUNT_RE.search(table_data[5])
        in_stock = int(stock_match.group()) if stock_match else 0

        book.update({
            'url': response.url,
            'title': response.css('div.product_main h1::text').get(),
            'description': response.xpath("//*[@id='product_description']/following::p/text()").get(),
            'image': response.urljoin(response.css('div.active img::attr(src)').get()),
            'rating': self._get_rating(rating_classes),
            'upc': table_data[0],
            'product_type': table_data[1],
            'price_excl_tax': table_data[2],
            'price_incl_tax': table_data[3],
            'tax': table_data[4],
            'in_stock': in_stock,
            'count_reviews': int(table_data[6]),
        })
        yield book

    def _get_rating(self, class_all):
        """Map the star-rating CSS classes to a numeric rating.

        Args:
            class_all: Container of CSS class names taken from the
                ``p.star-rating`` element (e.g. ``['star-rating', 'Three']``).

        Returns:
            The rating from 1 to 5 for the first matching word (checked in the
            order One..Five), or ``None`` if no rating word is present.
        """
        for word, value in self._RATING_WORDS:
            if word in class_all:
                return value
        return None