Commit c1c0513d authored by Kyryll Parolis

Saving sitemap data to db.

parent e4e1491e
......@@ -6,3 +6,5 @@ src/database/__pycache__
src/middlewares/__pycache__
src/pipelines/__pycache__
src/spiders/__pycache__
src/alembic/versions/__pycache__
src/alembic/__pycache__
......@@ -35,7 +35,8 @@ script_location = alembic
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = driver://user:pass@localhost/dbname
# sqlalchemy.url = driver://user:pass@localhost/dbname
sqlalchemy.url = mysql://root:Bc27981!cgfqlthvty1@localhost/buildzoom
[post_write_hooks]
......
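Reviewer note: the new sqlalchemy.url line above commits live database credentials to version control. A minimal alternative sketch, assuming a standard alembic env.py and a hypothetical DATABASE_URL environment variable (neither is part of this commit):

    # Hypothetical addition to src/alembic/env.py: prefer an environment
    # variable over the URL committed in alembic.ini. DATABASE_URL is an
    # assumed name, not something this repo defines.
    import os
    from alembic import context

    config = context.config
    env_url = os.environ.get("DATABASE_URL")
    if env_url:
        config.set_main_option("sqlalchemy.url", env_url)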
"""create sitemap table
Revision ID: 11c7799508d3
Revises: 4828596288b5
Create Date: 2020-06-16 16:51:50.292434
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "11c7799508d3"
down_revision = "4828596288b5"
branch_labels = None
depends_on = None
def upgrade():
op.create_table(
"sitemap",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("link", sa.String(255)),
)
def downgrade():
op.drop_table("sitemap")
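Reviewer note: the SitemapPipeline further down treats IntegrityError as a duplicate link, but that error only fires if the database enforces uniqueness, and the link column above has no unique constraint. A hedged sketch of a follow-up migration that would close the gap (uq_sitemap_link is a hypothetical name; this revision is not part of the commit):

    from alembic import op

    # Hypothetical follow-up revision: without a unique constraint on
    # sitemap.link, duplicate inserts commit silently instead of raising
    # IntegrityError.
    def upgrade():
        op.create_unique_constraint("uq_sitemap_link", "sitemap", ["link"])

    def downgrade():
        op.drop_constraint("uq_sitemap_link", "sitemap", type_="unique")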
......@@ -10,7 +10,7 @@ import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '4828596288b5'
revision = "4828596288b5"
down_revision = None
branch_labels = None
depends_on = None
......@@ -18,27 +18,27 @@ depends_on = None
def upgrade():
op.create_table(
'contractor',
sa.Column('id', sa.Integer, primary_key=True),
sa.Column('contractor_id', sa.String(255)),
sa.Column('name', sa.String(255)),
sa.Column('category', sa.String(255)),
sa.Column('rate', sa.Integer),
sa.Column('rate_category', sa.String(255)),
sa.Column('phone', sa.String(255)),
sa.Column('email', sa.String(255)),
sa.Column('website', sa.String(255)),
sa.Column('is_licensed', sa.Boolean),
sa.Column('license_information', sa.String(255)),
sa.Column('insured_value', sa.Integer),
sa.Column('bond_value', sa.Integer),
sa.Column('address', sa.JSON),
sa.Column('last_update_date', sa.Date),
sa.Column('workers_info', sa.String(255)),
sa.Column('image', sa.String(255)),
sa.Column('imageurl', sa.String(255))
"contractor",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("contractor_id", sa.String(255)),
sa.Column("name", sa.String(255)),
sa.Column("category", sa.String(255)),
sa.Column("rate", sa.Integer),
sa.Column("rate_category", sa.String(255)),
sa.Column("phone", sa.String(255)),
sa.Column("email", sa.String(255)),
sa.Column("website", sa.String(255)),
sa.Column("is_licensed", sa.Boolean),
sa.Column("license_information", sa.String(255)),
sa.Column("insured_value", sa.Integer),
sa.Column("bond_value", sa.Integer),
sa.Column("address", sa.JSON),
sa.Column("last_update_date", sa.Date),
sa.Column("workers_info", sa.String(255)),
sa.Column("image", sa.String(255)),
sa.Column("imageurl", sa.String(255)),
)
def downgrade():
op.drop_table('business')
op.drop_table("business")
from sqlalchemy import Column, String, Integer, Boolean, JSON, Date
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
DeclarativeBase = declarative_base()
......@@ -46,3 +45,14 @@ class ContractorData(DeclarativeBase):
self.workers_info = workers_info
self.image = image
self.imageurl = imageurl
class SitemapData(DeclarativeBase):
__tablename__ = "sitemap"
id = Column(Integer, primary_key=True)
link = Column(String(255))
def __init__(self, id=None, link=None):
self.id = id
self.link = link
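A minimal usage sketch for the new model, assuming database.connection.db is the configured SQLAlchemy session the pipelines already import; the URL is illustrative only:

    from database.connection import db
    from database.models import SitemapData

    # Persist a single sitemap link.
    db.add(SitemapData(link="https://www.buildzoom.com/sitemap-contractors.xml"))
    db.commit()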
......@@ -28,3 +28,7 @@ class ContractorItem(scrapy.Item):
work_preference = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
class SitemapItem(scrapy.Item):
link = scrapy.Field()
# -*- coding: utf-8 -*-
from database.connection import db
from database.models import ContractorData
from database.models import ContractorData, SitemapData
from scrapy.exceptions import DropItem
from sqlalchemy.exc import IntegrityError
class BusinessPipeline:
class ContractorsPipeline:
def process_item(self, item, spider):
# record = ContractorData(name=item['name'],
# # contractor_id=item['contractor_id'],
# category=item['category'],
# rate=item['rate'],
# rate_category=item['rate_category'],
# phone=item['phone'],
# email=item['email'],
# website=item['website'],
# is_licensed=item['is_licensed'],
# license_information=item['license_information'],
# insured_value=item['insured_value'],
# bond_value=item['bond_value'],
# address=item['address'],
# last_update_date=item['last_update_date'],
# workers_info=item['workers_info'],
# image='images/' + item['images'][0]['path'],
# imageurl=item['images'][0]['url'])
# db.add(record)
# try:
# db.commit()
# except IntegrityError:
# db.rollback()
# raise DropItem("Duplicate entry.")
return item
class SitemapPipeline:
    def process_item(self, item, spider):
        record = SitemapData(link=item["link"])
        db.add(record)
        try:
            db.commit()
        except IntegrityError:
            db.rollback()
            raise DropItem("Duplicate entry.")
        return item  # pass the item on so any later pipelines still receive it
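Because process_item only needs a mapping with a "link" key and ignores the spider argument, the pipeline can be exercised directly; a quick sketch, assuming the import path that settings.py uses (the URL is illustrative):

    from pipelines.pipelines import SitemapPipeline

    pipeline = SitemapPipeline()
    pipeline.process_item({"link": "https://www.buildzoom.com/sitemap.xml"}, None)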
......@@ -70,11 +70,11 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'scrapy.pipelines.images.ImagesPipeline': 1,
'pipelines.pipelines.BusinessPipeline': 300,
}
IMAGES_STORE = 'images'
# ITEM_PIPELINES = {
# 'scrapy.pipelines.images.ImagesPipeline': 1,
# 'pipelines.pipelines.BusinessPipeline': 300,
# }
# IMAGES_STORE = 'images'
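Commenting out the global ITEM_PIPELINES means the contractor spider no longer runs the images pipeline or ContractorsPipeline; instead, each spider that needs persistence opts in through custom_settings, as the sitemap spider below does.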
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
......
......@@ -5,6 +5,11 @@ from scrapy.spiders import Spider
class BuildzoomSitemapSpider(Spider):
name = "sitemap"
custom_settings = {
"ITEM_PIPELINES": {
"pipelines.pipelines.SitemapPipeline": 400
}
}
def start_requests(self):
start_urls = ["https://www.buildzoom.com/sitemap.xml"]
......
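The diff truncates the body of start_requests. A hypothetical completion that fetches the sitemap index and yields one SitemapItem per <loc> entry; the parse callback, its XPath, and the `from items import SitemapItem` path are assumptions, not code from this commit:

    import scrapy
    from scrapy.spiders import Spider
    from items import SitemapItem

    class BuildzoomSitemapSpider(Spider):
        name = "sitemap"
        custom_settings = {
            "ITEM_PIPELINES": {"pipelines.pipelines.SitemapPipeline": 400}
        }

        def start_requests(self):
            start_urls = ["https://www.buildzoom.com/sitemap.xml"]
            for url in start_urls:
                yield scrapy.Request(url, callback=self.parse)

        def parse(self, response):
            # Sitemap XML declares a default namespace; strip it so the
            # bare //loc XPath matches.
            response.selector.remove_namespaces()
            for loc in response.xpath("//loc/text()").getall():
                yield SitemapItem(link=loc)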