Commit b4479c05 authored by Kyryll Parolis

Created item, pipeline, middleware. Added first alembic revision.

parent 08debdc4
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = alembic
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; this defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat alembic/versions
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = driver://user:pass@localhost/dbname
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks=black
# black.type=console_scripts
# black.entrypoint=black
# black.options=-l 79
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
LOG_LEVEL=ERROR
LOG_FORMAT="%(levelname)s: %(message)s"
LOG_FILE=log.txt
DOWNLOAD_DELAY=3
CONCURRENT_REQUESTS=1
DB_USER=
DB_PASS=
DB_HOST=localhost
DB_NAME=scrapy
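# DOWNLOAD_DELAY and CONCURRENT_REQUESTS above are read in settings.py via
# os.getenv() after load_dotenv(); the DB_* values are read the same way in
# src/database/connection.py to build the SQLAlchemy connection URL.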
Generic single-database configuration.
\ No newline at end of file
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = None
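# If autogenerate support is wanted, target_metadata could instead point at the
# project's declarative base (a sketch; assumes src.database.models is importable
# from wherever Alembic is run):
# from src.database.models import DeclarativeBase
# target_metadata = DeclarativeBase.metadata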
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
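# alembic.ini above still carries the placeholder driver://user:pass@localhost/dbname.
# A sketch of rebuilding the URL here from the same DB_* variables that
# connection.py uses (assumes python-dotenv is also available in this environment):
# import os
# from dotenv import load_dotenv
# load_dotenv()
# config.set_main_option(
#     "sqlalchemy.url",
#     f"mysql://{os.getenv('DB_USER')}:{os.getenv('DB_PASS')}"
#     f"@{os.getenv('DB_HOST')}/{os.getenv('DB_NAME')}?charset=utf8",
# )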
def run_migrations_offline():
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online():
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = engine_from_config(
config.get_section(config.config_ini_section),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
context.configure(
connection=connection, target_metadata=target_metadata
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}
"""create business table
Revision ID: 4828596288b5
Revises:
Create Date: 2020-06-15 13:17:03.596270
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '4828596288b5'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
op.create_table(
'business',
sa.Column('id', sa.Integer, primary_key=True),
sa.Column('business_id', sa.String(255)),
sa.Column('name', sa.String(255)),
sa.Column('category', sa.String(255)),
sa.Column('rate', sa.Integer),
sa.Column('rate_category', sa.String(255)),
sa.Column('phone', sa.String(255)),
sa.Column('email', sa.String(255)),
sa.Column('website', sa.String(255)),
sa.Column('is_licensed', sa.Boolean),
sa.Column('license_information', sa.String(255)),
sa.Column('insured_value', sa.Integer),
sa.Column('bond_value', sa.Integer),
sa.Column('address', sa.JSON),
sa.Column('last_update_date', sa.Date),
sa.Column('workers_info', sa.String(255)),
sa.Column('image', sa.String(255)),
sa.Column('imageurl', sa.String(255))
)
def downgrade():
op.drop_table('business')
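# Applying or reverting this revision follows the usual Alembic workflow, e.g.:
#   alembic upgrade head      # runs upgrade() and creates the business table
#   alembic downgrade -1      # runs downgrade() and drops it again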
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
import os
from dotenv import load_dotenv
load_dotenv()
# db settings
db_user = os.getenv("DB_USER")
db_pass = os.getenv("DB_PASS")
db_host = os.getenv("DB_HOST")
db_name = os.getenv("DB_NAME")
engine = create_engine(f"mysql://{db_user}:{db_pass}@{db_host}/{db_name}?charset=utf8&use_unicode=0",
echo=False,
pool_recycle=1800)
db = scoped_session(sessionmaker(autocommit=False,
autoflush=False,
bind=engine))
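# scoped_session provides a thread-local Session registry, so this module-level
# `db` can be imported and used directly (db.add(...), db.commit(), db.rollback()),
# which is how the item pipeline below consumes it.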
from sqlalchemy import Column, String, Integer, Boolean, JSON, Date
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
DeclarativeBase = declarative_base()
class BusinessData(DeclarativeBase):
__tablename__ = 'business'
id = Column(Integer, primary_key=True)
business_id = Column(String(255))
name = Column(String(255))
category = Column(String(255))
rate = Column(Integer)
rate_category = Column(String(255))
phone = Column(String(255))
email = Column(String(255))
website = Column(String(255))
is_licensed = Column(Boolean)
license_information = Column(String(255))
insured_value = Column(Integer)
bond_value = Column(Integer)
address = Column(JSON)
last_update_date = Column(Date)
workers_info = Column(String(255))
image = Column(String(255))
imageurl = Column(String(255))
def __init__(self, id=None, business_id=None, name=None, category=None, rate=None, rate_category=None, phone=None, email=None, website=None, is_licensed=None, license_information=None, insured_value=None, bond_value=None, address=None, last_update_date=None, workers_info=None, image=None, imageurl=None):
self.id = id
self.business_id = business_id
self.name = name
self.category = category
self.rate = rate
self.rate_category = rate_category
self.phone = phone
self.email = email
self.website = website
self.is_licensed = is_licensed
self.license_information = license_information
self.insured_value = insured_value
self.bond_value = bond_value
self.address = address
self.last_update_date = last_update_date
self.workers_info = workers_info
self.image = image
self.imageurl = imageurl
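# Note: this explicit __init__ mirrors the default keyword-argument constructor
# that declarative_base() already provides for mapped columns, so it mainly
# documents which fields the model accepts.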
@@ -8,7 +8,23 @@
import scrapy
class BuildzoomParserItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class BusinessItem(scrapy.Item):
business_id = scrapy.Field()
name = scrapy.Field()
description = scrapy.Field()
category = scrapy.Field()
rate = scrapy.Field()
rate_category = scrapy.Field()
phone = scrapy.Field()
email = scrapy.Field()
website = scrapy.Field()
is_licensed = scrapy.Field()
license_information = scrapy.Field()
insured_value = scrapy.Field()
bond_value = scrapy.Field()
address = scrapy.Field() # country, city, zip_code and full string
last_update_date = scrapy.Field()
workers_info = scrapy.Field()
work_preference = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
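# image_urls / images are the default field names consumed and populated by
# scrapy.pipelines.images.ImagesPipeline, which is enabled in settings.py.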
@@ -4,8 +4,9 @@
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from dotenv import load_dotenv
from scrapy import signals
import os
class BuildzoomParserSpiderMiddleware:
@@ -57,6 +58,7 @@ class BuildzoomParserSpiderMiddleware:
class BuildzoomParserDownloaderMiddleware:
load_dotenv()
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@@ -69,16 +71,8 @@ class BuildzoomParserDownloaderMiddleware:
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
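# Route every request through the proxy taken from the environment; http_proxy
# is assumed to hold a full proxy URL such as http://host:port.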
request.meta['proxy'] = os.getenv('http_proxy')
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
......
# -*- coding: utf-8 -*-
from src.database.connection import db
from src.database.models import BusinessData
from scrapy.exceptions import DropItem
from sqlalchemy.exc import IntegrityError
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
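# BusinessPipeline runs after the built-in ImagesPipeline (priority 300 vs. 1 in
# ITEM_PIPELINES), so by the time process_item is called, item['images'] is
# expected to already hold the downloaded image's path and url.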
class BusinessPipeline:
class BuildzoomParserPipeline:
def process_item(self, item, spider):
record = BusinessData(business_id=item['business_id'],
name=item['name'],
category=item['category'],
rate=item['rate'],
rate_category=item['rate_category'],
phone=item['phone'],
email=item['email'],
website=item['website'],
is_licensed=item['is_licensed'],
license_information=item['license_information'],
insured_value=item['insured_value'],
bond_value=item['bond_value'],
address=item['address'],
last_update_date=item['last_update_date'],
workers_info=item['workers_info'],
image='images/' + item['images'][0]['path'],
imageurl=item['images'][0]['url'])
db.add(record)
try:
db.commit()
except IntegrityError:
db.rollback()
raise DropItem("Duplicate entry.")
return item
import os
from dotenv import load_dotenv
# -*- coding: utf-8 -*-
# Scrapy settings for buildzoom_parser project
@@ -8,83 +10,89 @@
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
load_dotenv()
BOT_NAME = "buildzoom_parser"
BOT_NAME = 'buildzoom_parser'
SPIDER_MODULES = ['buildzoom_parser.spiders']
NEWSPIDER_MODULE = 'buildzoom_parser.spiders'
SPIDER_MODULES = ["spiders"]
NEWSPIDER_MODULE = "spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'buildzoom_parser (+http://www.yourdomain.com)'
# USER_AGENT = 'buildzoom_parser (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS = int(os.getenv("CONCURRENT_REQUESTS", "1"))  # env values are strings
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = float(os.getenv("DOWNLOAD_DELAY", "3"))  # env values are strings
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
DEFAULT_REQUEST_HEADERS = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "en-US",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.165 Safari/537.36",
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'buildzoom_parser.middlewares.BuildzoomParserSpiderMiddleware': 543,
#}
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'buildzoom_parser.middlewares.BuildzoomParserDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
'middlewares.middlewares.BuildzoomParserDownloaderMiddleware': 350,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}
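# The custom downloader middleware (350) runs before HttpProxyMiddleware (400),
# so the proxy it writes into request.meta is applied when the request is downloaded.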
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'buildzoom_parser.pipelines.BuildzoomParserPipeline': 300,
#}
ITEM_PIPELINES = {
'scrapy.pipelines.images.ImagesPipeline': 1,
'pipelines.pipelines.BusinessPipeline': 300,
}
IMAGES_STORE = 'images'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'