Commit 4419f630 authored by Kyryll Parolis

Merge branch 'release/0.0.1'

parents 7dc873f0 ece77f6d
venv/
.idea/
src/.env
src/__pycache__
src/database/__pycache__
src/middlewares/__pycache__
src/pipelines/__pycache__
src/spiders/__pycache__
[tool.poetry]
name = "kobo_parser"
version = "0.0.1"
description = "Scraper for the yelp.com website."
authors = [
"Kyryll Parolys <kyryllpar@gmail.com>"
]
[tool.poetry.dependencies]
python = "^3.6"
scrapy = "^2.1.0"
mysqlclient = "^1.4.6"
sqlalchemy = "^1.3.17"
alembic = "^1.4.2"
requests = "^2.23.0"
pillow = "^7.1.2"
python-dotenv = "^0.13.0"
[tool.poetry.dev-dependencies]
pylint = "^2.5.3"
LOG_LEVEL=ERROR
LOG_FORMAT="%(levelname)s: %(message)s"
LOG_FILE=log.txt
DOWNLOAD_DELAY=3
CONCURRENT_REQUESTS=1
DB_USER=
DB_PASS=
DB_HOST=localhost
DB_NAME=scrapy
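The values above are only examples; at runtime they are read through python-dotenv. A minimal sketch of how the Scrapy settings module could consume them (the actual src/settings.py is not part of this diff, so the names here are assumptions):
# src/settings.py -- hypothetical sketch; the real settings module is not shown in this diff.
import os
from dotenv import load_dotenv

load_dotenv()  # read src/.env into the process environment

LOG_LEVEL = os.getenv("LOG_LEVEL", "ERROR")
LOG_FORMAT = os.getenv("LOG_FORMAT", "%(levelname)s: %(message)s")
LOG_FILE = os.getenv("LOG_FILE", "log.txt")

# Throttling values come from .env so they can be tuned without code changes.
DOWNLOAD_DELAY = float(os.getenv("DOWNLOAD_DELAY", "3"))
CONCURRENT_REQUESTS = int(os.getenv("CONCURRENT_REQUESTS", "1"))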
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = alembic
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; this defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat alembic/versions
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = driver://user:pass@localhost/dbname
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks=black
# black.type=console_scripts
# black.entrypoint=black
# black.options=-l 79
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
Generic single-database configuration.
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = None
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline():
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online():
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = engine_from_config(
config.get_section(config.config_ini_section),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
context.configure(
connection=connection, target_metadata=target_metadata
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
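alembic.ini above still carries the placeholder sqlalchemy.url; one common pattern (an assumption here, not something this diff implements) is to override it in env.py from the same .env variables the application uses:
# Possible addition to alembic/env.py (not in this diff): build the database URL
# from the .env variables instead of the placeholder in alembic.ini.
import os
from dotenv import load_dotenv

load_dotenv()
config.set_main_option(
    "sqlalchemy.url",
    f"mysql://{os.getenv('DB_USER')}:{os.getenv('DB_PASS')}"
    f"@{os.getenv('DB_HOST')}/{os.getenv('DB_NAME')}",
)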
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}
"""create business table
Revision ID: 4828596288b5
Revises:
Create Date: 2020-06-15 13:17:03.596270
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '4828596288b5'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
op.create_table(
'business',
sa.Column('id', sa.Integer, primary_key=True),
sa.Column('contractor_id', sa.String(255)),
sa.Column('name', sa.String(255)),
sa.Column('category', sa.String(255)),
sa.Column('rate', sa.Integer),
sa.Column('rate_category', sa.String(255)),
sa.Column('phone', sa.String(255)),
sa.Column('email', sa.String(255)),
sa.Column('website', sa.String(255)),
sa.Column('is_licensed', sa.Boolean),
sa.Column('license_information', sa.String(255)),
sa.Column('insured_value', sa.Integer),
sa.Column('bond_value', sa.Integer),
sa.Column('address', sa.JSON),
sa.Column('last_update_date', sa.Date),
sa.Column('workers_info', sa.String(255)),
sa.Column('image', sa.String(255)),
sa.Column('imageurl', sa.String(255))
)
def downgrade():
op.drop_table('business')
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
import os
from dotenv import load_dotenv
load_dotenv()
# db settings
db_user = os.getenv("DB_USER")
db_pass = os.getenv("DB_PASS")
db_host = os.getenv("DB_HOST")
db_name = os.getenv("DB_NAME")
engine = create_engine(f"mysql://{db_user}:{db_pass}@{db_host}/{db_name}?charset=utf8",  # use_unicode=0 dropped: it returns bytes instead of str under Python 3
echo=False,
pool_recycle=1800)
db = scoped_session(sessionmaker(autocommit=False,
autoflush=False,
bind=engine))
from sqlalchemy import Column, String, Integer, Boolean, JSON, Date
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
DeclarativeBase = declarative_base()
class ContractorData(DeclarativeBase):
__tablename__ = 'business'
id = Column(Integer, primary_key=True)
contractor_id = Column(String(255))
name = Column(String(255))
category = Column(String(255))
rate = Column(Integer)
rate_category = Column(String(255))
phone = Column(String(255))
email = Column(String(255))
website = Column(String(255))
is_licensed = Column(Boolean)
license_information = Column(String(255))
insured_value = Column(Integer)
bond_value = Column(Integer)
address = Column(JSON)
last_update_date = Column(Date)
workers_info = Column(String(255))
image = Column(String(255))
imageurl = Column(String(255))
def __init__(self, id=None, contractor_id=None, name=None, category=None, rate=None, rate_category=None, phone=None, email=None, website=None, is_licensed=None, license_information=None, insured_value=None, bond_value=None, address=None, last_update_date=None, workers_info=None, image=None, imageurl=None):
self.id = id
self.contractor_id = contractor_id
self.name = name
self.category = category
self.rate = rate
self.rate_category = rate_category
self.phone = phone
self.email = email
self.website = website
self.is_licensed = is_licensed
self.license_information = license_information
self.insured_value = insured_value
self.bond_value = bond_value
self.address = address
self.last_update_date = last_update_date
self.workers_info = workers_info
self.image = image
self.imageurl = imageurl
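The pipeline that persists scraped items is not included in this diff (only src/pipelines/__pycache__ appears in the .gitignore above). Below is a minimal sketch of how such a pipeline could map items onto ContractorData using the scoped session; the module paths and class name are assumptions:
# src/pipelines.py -- hypothetical sketch, not part of this diff.
from database.database import db            # assumed path of the scoped session defined above
from database.models import ContractorData  # assumed path of the model defined above


class MySQLPipeline:
    """Persist each scraped item as a row in the business table."""

    def process_item(self, item, spider):
        record = ContractorData(
            contractor_id=item.get("contractor_id"),
            name=item.get("name"),
            category=item.get("category"),
            rate=item.get("rate"),
            phone=item.get("phone"),
            email=item.get("email"),
            website=item.get("website"),
            address=item.get("address"),
        )
        try:
            db.add(record)
            db.commit()
        except Exception:
            db.rollback()
            raise
        return item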
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ContractorItem(scrapy.Item):
contractor_id = scrapy.Field()
name = scrapy.Field()
description = scrapy.Field()
category = scrapy.Field()
rate = scrapy.Field()
rate_category = scrapy.Field()
phone = scrapy.Field()
email = scrapy.Field()
website = scrapy.Field()
is_licensed = scrapy.Field()
license_information = scrapy.Field()
insured_value = scrapy.Field()
bond_value = scrapy.Field()
address = scrapy.Field() # country, city, zip_code and full string
last_update_date = scrapy.Field()
workers_info = scrapy.Field()
work_preference = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from dotenv import load_dotenv
from scrapy import signals
import os
class BuildzoomParserSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class BuildzoomParserDownloaderMiddleware:
load_dotenv()
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
    # Route every request through the proxy set in the http_proxy environment variable;
    # returning None lets Scrapy continue processing the request normally.
    request.meta['proxy'] = os.getenv('http_proxy')
    return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
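These middlewares only take effect once they are enabled in the Scrapy settings. A sketch of the relevant entries follows (the settings module and exact module path are not shown in this diff, so both are assumptions):
# Hypothetical excerpt from src/settings.py (not part of this diff).
DOWNLOADER_MIDDLEWARES = {
    "middlewares.BuildzoomParserDownloaderMiddleware": 543,  # assumed module path
}
SPIDER_MIDDLEWARES = {
    "middlewares.BuildzoomParserSpiderMiddleware": 543,      # assumed module path
}
# The proxy injected in process_request() is read from the http_proxy environment variable.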