From 6c10a0620b2506b1aa8ad42440b47133e109113a Mon Sep 17 00:00:00 2001 From: Gusev Anton <gusev_aa@groupbwt.com> Date: Thu, 13 Jun 2019 17:19:06 +0300 Subject: [PATCH] add logging.debug for BooksSpider --- src/spiders/books_spider.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/spiders/books_spider.py b/src/spiders/books_spider.py index d8c4c13..11dc3b5 100644 --- a/src/spiders/books_spider.py +++ b/src/spiders/books_spider.py @@ -1,25 +1,30 @@ -import scrapy +from scrapy import Spider import re from items.BookItem import BookItem from database.connectors.BookConnector import BookConnector -import logging -class BooksSpider(BookConnector, scrapy.Spider): +class BooksSpider(Spider, BookConnector): name = 'books' start_urls = ['http://books.toscrape.com/'] + page = 1 def parse(self, response): + self.logger.debug('Current page: {}'.format(self.page)) + # follow links to book pages - for idx, href in enumerate(response.css('div.image_container a::attr(href)')): # TODO delete enumerate - yield response.follow(href, self.parse_book) + for idx, href in enumerate(response.css('div.image_container a::attr(href)')): # TODO delete enumerate + yield response.follow(href, self.parse_book, meta={'idx': idx}) # pagination next_page = response.css('li.next a::attr(href)').get() if next_page is not None: + self.page += 1 yield response.follow(next_page, callback=self.parse) def parse_book(self, response): + self.logger.debug('Index book in page: {}'.format(response.meta.get('idx'))) + book = BookItem() table_data = response.css('table td::text').getall() @@ -50,4 +55,4 @@ class BooksSpider(BookConnector, scrapy.Spider): if 'Four' in class_all: return 4 if 'Five' in class_all: - return 5 + return 5 \ No newline at end of file -- GitLab