Skip to content
Snippets Groups Projects

Develop

Merged Anton Gusev requested to merge develop into master
1 file
+ 11
6
Compare changes
  • Side-by-side
  • Inline
import scrapy
from scrapy import Spider
import re
from items.BookItem import BookItem
from database.connectors.BookConnector import BookConnector
import logging
class BooksSpider(BookConnector, scrapy.Spider):
class BooksSpider(Spider, BookConnector):
name = 'books'
start_urls = ['http://books.toscrape.com/']
page = 1
def parse(self, response):
# Callback for a listing page: logs the current page counter, yields one
# follow-up request per book link found on the page (handled by
# self.parse_book), then yields a request for the next listing page (handled
# by this same method) when a "next" link is present.
self.logger.debug('Current page: {}'.format(self.page))
# follow links to book pages
# NOTE(review): the loop below appears twice with different bodies. This looks
# like the removed and added sides of a diff shown together rather than two
# loops meant to run back to back -- confirm against the actual file.
# Variant without `meta`: idx from enumerate is unused here.
for idx, href in enumerate(response.css('div.image_container a::attr(href)')): # TODO delete enumerate
yield response.follow(href, self.parse_book)
# Variant with `meta`: idx is passed along as meta['idx'], which parse_book
# reads back via response.meta.get('idx') for its debug log. In this variant
# enumerate is what supplies idx, so the TODO only applies once idx is no
# longer needed.
for idx, href in enumerate(response.css('div.image_container a::attr(href)')): # TODO delete enumerate
yield response.follow(href, self.parse_book, meta={'idx': idx})
# pagination
# href of the "next" link; the check below treats None as "no further page".
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
# Bump the page counter used by the debug log at the top of this method.
self.page += 1
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response):
self.logger.debug('Index book in page: {}'.format(response.meta.get('idx')))
book = BookItem()
table_data = response.css('table td::text').getall()
@@ -50,4 +55,4 @@ class BooksSpider(BookConnector, scrapy.Spider):
if 'Four' in class_all:
return 4
if 'Five' in class_all:
return 5
return 5
\ No newline at end of file
Loading