Commit 3e6d68b3 authored by Kyryll Parolis's avatar Kyryll Parolis
Browse files

Finished sitemap spider.

parent b4479c05
venv/
.idea/
src/.env
src/__pycache__
src/database/__pycache__
src/middlewares/__pycache__
src/pipelines/__pycache__
src/spiders/__pycache__
......@@ -18,9 +18,9 @@ depends_on = None
def upgrade():
op.create_table(
'business',
'contractor',
sa.Column('id', sa.Integer, primary_key=True),
sa.Column('business_id', sa.String(255)),
sa.Column('contractor_id', sa.String(255)),
sa.Column('name', sa.String(255)),
sa.Column('category', sa.String(255)),
sa.Column('rate', sa.Integer),
......
......@@ -5,11 +5,11 @@ from sqlalchemy.ext.declarative import declarative_base
DeclarativeBase = declarative_base()
class BusinessData(DeclarativeBase):
class ContractorData(DeclarativeBase):
__tablename__ = 'business'
id = Column(Integer, primary_key=True)
business_id = Column(String(255))
contractor_id = Column(String(255))
name = Column(String(255))
category = Column(String(255))
rate = Column(Integer)
......@@ -27,9 +27,9 @@ class BusinessData(DeclarativeBase):
image = Column(String(255))
imageurl = Column(String(255))
def __init__(self, id=None, business_id=None, name=None, category=None, rate=None, rate_category=None, phone=None, email=None, website=None, is_licensed=None, license_information=None, insured_value=None, bond_value=None, address=None, last_update_date=None, workers_info=None, image=None, imageurl=None):
def __init__(self, id=None, contractor_id=None, name=None, category=None, rate=None, rate_category=None, phone=None, email=None, website=None, is_licensed=None, license_information=None, insured_value=None, bond_value=None, address=None, last_update_date=None, workers_info=None, image=None, imageurl=None):
self.id = id
self.business_id = business_id
self.contractor_id = contractor_id
self.name = name
self.category = category
self.rate = rate
......
......@@ -8,8 +8,8 @@
import scrapy
# NOTE(review): diff residue — the old BusinessItem declaration and its
# business_id field are shown alongside the renamed ContractorItem; only
# the Contractor* names survive in the new revision of this file.
class BusinessItem(scrapy.Item):
# old field name, replaced by contractor_id below
business_id = scrapy.Field()
# Item carrying one scraped contractor record; shared fields continue below.
class ContractorItem(scrapy.Item):
contractor_id = scrapy.Field()
name = scrapy.Field()
description = scrapy.Field()
category = scrapy.Field()
......
# -*- coding: utf-8 -*-
from src.database.connection import db
from src.database.models import BusinessData
from database.connection import db
from database.models import ContractorData
from scrapy.exceptions import DropItem
from sqlalchemy.exc import IntegrityError
......@@ -8,28 +8,28 @@ from sqlalchemy.exc import IntegrityError
class BusinessPipeline:
    """Scrapy item pipeline that persists scraped business/contractor records.

    NOTE(review): the original revision contained the record-building code
    twice — once live and once fully commented out — with mangled
    indentation; the dead commented-out copy has been removed here and the
    live behavior kept intact.
    """

    def process_item(self, item, spider):
        """Persist *item* to the database, dropping duplicates.

        Builds a ``BusinessData`` row from the item's fields and commits it.
        On an ``IntegrityError`` (duplicate key) the session is rolled back
        so it remains usable, and ``DropItem`` is raised to discard the
        item. Returns the item unchanged so later pipelines still see it.
        """
        record = BusinessData(
            business_id=item['business_id'],
            name=item['name'],
            category=item['category'],
            rate=item['rate'],
            rate_category=item['rate_category'],
            phone=item['phone'],
            email=item['email'],
            website=item['website'],
            is_licensed=item['is_licensed'],
            license_information=item['license_information'],
            insured_value=item['insured_value'],
            bond_value=item['bond_value'],
            address=item['address'],
            last_update_date=item['last_update_date'],
            workers_info=item['workers_info'],
            # First downloaded image only; path/url come from Scrapy's
            # images pipeline output.
            image='images/' + item['images'][0]['path'],
            imageurl=item['images'][0]['url'],
        )
        db.add(record)
        try:
            db.commit()
        except IntegrityError:
            # Roll back the failed transaction so the session stays usable,
            # then tell Scrapy to discard this duplicate item.
            db.rollback()
            raise DropItem("Duplicate entry.")
        return item
......@@ -4,7 +4,7 @@
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = buildzoom_parser.settings
default = settings
[deploy]
#url = http://localhost:6800/
......
......@@ -21,7 +21,7 @@ NEWSPIDER_MODULE = "spiders"
# USER_AGENT = 'buildzoom_parser (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = os.getenv("CONCURRENT_REQUESTS")
......
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider
# Original skeleton spider for buildzoom.com; superseded in this revision
# by the sitemap spider, but still shown by the diff. Attribute-only stub:
# no parse logic was ever implemented here.
class BuildzoomSpider(scrapy.Spider):
name = 'buildzoom'
allowed_domains = ['buildzoom.com']
start_urls = ['http://buildzoom.com/']
class BuildzoomSitemapSpider(Spider):
    """Walk buildzoom.com's sitemap index and yield every contractor URL.

    The root sitemap index is fetched first; every entry mentioning
    "contractors" is itself a sitemap of contractor pages, whose URLs are
    finally yielded as ``{'link': url}`` dicts.
    """

    name = "sitemap"
    # Entry point hoisted out of start_requests so subclasses/tests can
    # override it without re-implementing the method.
    start_urls = ["https://www.buildzoom.com/sitemap.xml"]

    @staticmethod
    def _contractor_links(response):
        """Return all text nodes of *response* containing 'contractors'.

        The sitemap XML is not parsed with namespaces; ``.//text()``
        collects every text node (the <loc> URLs among them) and the
        substring filter keeps only contractor-related entries.
        """
        return [t for t in response.xpath('.//text()').getall()
                if "contractors" in t]

    def parse(self, response):
        # Unused: every request issued below carries an explicit callback.
        # Kept because Spider.parse is the default-callback contract.
        pass

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse_sitemap)

    def parse_sitemap(self, response: scrapy.http.Response):
        # Each matching index entry is a nested sitemap of contractor pages.
        for link in self._contractor_links(response):
            yield scrapy.Request(link, callback=self.parse_contractor_urls)

    def parse_contractor_urls(self, response):
        # Leaf sitemap: emit each contractor page URL as a scraped item.
        for link in self._contractor_links(response):
            yield {
                'link': link
            }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment