Skip to content
Snippets Groups Projects
books_spider.py 1.94 KiB
Newer Older
Anton Gusev's avatar
Anton Gusev committed
import scrapy
import re
Anton Gusev's avatar
Anton Gusev committed
from items.BookItem import BookItem
from database.connectors.BookConnector import BookConnector
import logging
Anton Gusev's avatar
Anton Gusev committed

class BooksSpider(BookConnector, scrapy.Spider):
    """Spider that crawls the books.toscrape.com catalogue.

    ``parse`` walks the paginated listing and schedules one request per book;
    ``parse_book`` turns a book detail page into a ``BookItem``.
    """

    name = 'books'
    start_urls = [
        'http://books.toscrape.com/catalogue/page-1.html'
    ]
    # NOTE(review): Scrapy documents ``custom_settings`` as the place for
    # per-spider settings; a bare ``ITEM_PIPELINES`` class attribute is
    # presumably never read by the framework. Confirm whether this should move
    # into ``custom_settings`` (or the project settings) before relying on it.
    ITEM_PIPELINES = {
        'tutorial.pipelines.PricePipeline': 1,
    }

    # Number of cells ``parse_book`` reads from the product information table.
    _TABLE_CELLS = 7

    # Words looked up in the ``star-rating`` class list, checked in this order.
    _RATINGS = (
        ('One', 1),
        ('Two', 2),
        ('Three', 3),
        ('Four', 4),
        ('Five', 5),
    )

    def parse(self, response):
        """Yield a request for every book on a listing page, then the next page."""
        # follow links to book pages
        for href in response.css('div.image_container a::attr(href)'):  # TODO
            yield response.follow(href, self.parse_book)

        # pagination
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Build a ``BookItem`` from a book detail page and yield it.

        Pages whose product table has fewer cells than expected are logged
        and skipped instead of failing with an ``IndexError``. A missing
        rating, cover image, or stock count is stored as ``None``.
        """
        table_data = response.css('table td::text').getall()
        if len(table_data) < self._TABLE_CELLS:
            self.logger.warning(
                'Skipping %s: expected %d product table cells, found %d',
                response.url, self._TABLE_CELLS, len(table_data),
            )
            return

        # ``.get()`` returns None when the element is absent; fall back to an
        # empty class list so the rating becomes None instead of raising.
        rating_classes = response.css('p.star-rating::attr(class)').get() or ''

        # ``urljoin`` with a missing src would return the page URL itself,
        # which is not an image; store None in that case.
        image_src = response.css('div.active img::attr(src)').get()
        image_url = response.urljoin(image_src) if image_src else None

        # The availability cell embeds the count in free text; take the first
        # run of digits, or None when the text contains no number.
        stock_match = re.search(r'\d+', table_data[5])
        in_stock = int(stock_match.group()) if stock_match else None

        book = BookItem()
        book.update({
            'url': response.url,
            'title': response.css('div.product_main h1::text').get(),
            'description': response.xpath("//*[@id='product_description']/following::p/text()").get(),
            'image': image_url,
            'rating': self._get_rating(rating_classes.split(' ')),
            'upc': table_data[0],
            'product_type': table_data[1],
            'price_excl_tax': table_data[2],
            'price_incl_tax': table_data[3],
            'tax': table_data[4],
            'in_stock': in_stock,
            'count_reviews': int(table_data[6]),
        })
        yield book

    def _get_rating(self, class_all):
        """Return the 1-5 rating named in ``class_all``, or None if absent.

        ``class_all`` is the collection of CSS class names taken from the
        ``star-rating`` element; the first of 'One'..'Five' found in it wins.
        """
        for word, value in self._RATINGS:
            if word in class_all:
                return value
        return None