"src/api/blog/controllers/blog.js" did not exist on "82028ae959a1ce6cea2a03dceeacd2ea3cd6441b"
Newer
Older
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
class BooksSpider(scrapy.Spider):
    """Crawl the paginated catalogue starting at ``start_urls`` and yield one ``BookItem`` per book page.

    ``parse`` handles listing pages (book links plus the "next" pagination link);
    ``parse_book`` extracts the fields of a single book page.
    """

    name = 'books'
    start_urls = [
        'http://books.toscrape.com/catalogue/page-1.html'
    ]

    # Kept under its original name so any code reading ``BooksSpider.ITEM_PIPELINES``
    # keeps working. On its own this attribute has no effect on Scrapy.
    ITEM_PIPELINES = {
        'tutorial.pipelines.PricePipeline': 1,
    }

    # Scrapy reads per-spider setting overrides from ``custom_settings`` only, so the
    # pipeline mapping has to be exposed here to actually be enabled for this spider.
    # NOTE(review): this overrides the project-wide ITEM_PIPELINES value for this
    # spider -- confirm no other pipelines from settings.py are expected to run.
    custom_settings = {
        'ITEM_PIPELINES': ITEM_PIPELINES,
    }

    def parse(self, response):
        """Yield a request for every book linked from a listing page, then for the next page.

        :param response: response for a catalogue listing page.
        :returns: generator of requests; book links are routed to ``parse_book``,
            the pagination link back to ``parse``.
        """
        # follow links to book pages
        for href in response.css('div.image_container a::attr(href)'):  # TODO (carried over from original; no detail given)
            yield response.follow(href, self.parse_book)

        # pagination: the crawl stops when the page has no "next" link
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response):
        """Build and yield a ``BookItem`` from a single book page.

        The ``td`` texts of the page's table are consumed positionally
        (index 0..6 -> upc, product_type, price_excl_tax, price_incl_tax, tax,
        in_stock, count_reviews), so an ``IndexError`` is raised if fewer
        than seven cells are present.

        :param response: response for a book detail page.
        :returns: generator yielding one populated ``BookItem``.
        """
        book = BookItem()
        table_data = response.css('table td::text').getall()

        # ``default=''`` keeps a page without a star-rating element from crashing on
        # ``None.split``; ``_get_rating`` then reports ``None`` for the rating.
        rating_classes = response.css('p.star-rating::attr(class)').get(default='').split(' ')

        book.update({
            'url': response.url,
            'title': response.css('div.product_main h1::text').get(),
            'description': response.xpath("//*[@id='product_description']/following::p/text()").get(),
            'image': response.urljoin(response.css('div.active img::attr(src)').get()),
            'rating': self._get_rating(rating_classes),
            'upc': table_data[0],
            'product_type': table_data[1],
            'price_excl_tax': table_data[2],
            'price_incl_tax': table_data[3],
            'tax': table_data[4],
            'in_stock': self._parse_stock_count(table_data[5]),
            'count_reviews': int(table_data[6]),
        })
        yield book

    def _parse_stock_count(self, availability_text):
        """Return the first run of digits in ``availability_text`` as an int, or ``None`` if there is none."""
        stock_match = re.search(r'\d+', availability_text)
        if stock_match is None:
            # No number in the availability cell: report "unknown" instead of
            # raising AttributeError on ``None.group()``.
            return None
        return int(stock_match.group())

    def _get_rating(self, class_all):
        """Map the star-rating CSS class words to a number from 1 to 5.

        :param class_all: container of class names (``parse_book`` passes the
            space-split ``class`` attribute).
        :returns: the number for the first of 'One'..'Five' found in
            ``class_all`` (checked in that order), or ``None`` if none is present.
        """
        rating_words = ('One', 'Two', 'Three', 'Four', 'Five')
        for value, word in enumerate(rating_words, start=1):
            if word in class_all:
                return value
        return None