Skip to content
Snippets Groups Projects
Commit 2cc49932 authored by Anton Gusev's avatar Anton Gusev
Browse files

Develop

parent ac75a1c3
No related branches found
No related tags found
No related merge requests found
Showing
with 480 additions and 1 deletion
.*
image
!.gitignore
!.env.example
alembic.ini
*.pyc
*.json
\ No newline at end of file
__pycache__
*.json
src/spiders/test_spider.py
*.log
\ No newline at end of file
1. Rename .env.example to .env and set your settings
2. Rename alembic.ini.example to alembic.ini and set sqlalchemy.url field
3. Install dependencies from requirments.txt file
4. Create the database tables either by importing sql\create-dump.sql or by running "alembic upgrade head" from the src directory
5. From the src directory, run "scrapy crawl books" to start parsing.
\ No newline at end of file
alembic==1.0.10
asn1crypto==0.24.0
attrs==19.1.0
Automat==0.7.0
cffi==1.12.3
constantly==15.1.0
cryptography==2.7
cssselect==1.0.3
hyperlink==19.0.0
idna==2.8
incremental==17.5.0
lxml==4.3.4
Mako==1.0.12
MarkupSafe==1.1.1
parsel==1.5.1
Pillow==6.0.0
pyasn1==0.4.5
pyasn1-modules==0.2.5
pycparser==2.19
PyDispatcher==2.0.5
PyHamcrest==1.9.0
PyMySQL==0.9.3
pyOpenSSL==19.0.0
pypiwin32==223
python-dateutil==2.8.0
python-dotenv==0.10.3
python-editor==1.0.4
pywin32==224
queuelib==1.5.0
Scrapy==1.6.0
service-identity==18.1.0
six==1.12.0
SQLAlchemy==1.3.4
Twisted==19.2.1
w3lib==1.20.0
zope.interface==4.6.0
-- --------------------------------------------------------
-- Host: 127.0.0.1
-- Server version: 8.0.15 - MySQL Community Server - GPL
-- Server OS: Win64
-- HeidiSQL Version: 10.1.0.5464
-- --------------------------------------------------------
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET NAMES utf8 */;
/*!50503 SET NAMES utf8mb4 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
-- Dumping structure for table db-scrapy-tutorial.alembic_version
-- Single-column table holding a migration revision identifier
-- (version_num is both the only column and the primary key).
CREATE TABLE IF NOT EXISTS `alembic_version` (
`version_num` varchar(32) NOT NULL,
PRIMARY KEY (`version_num`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- Data exporting was unselected.
-- Dumping structure for table db-scrapy-tutorial.books
-- One row per scraped book. `upc` is unique; every other non-text
-- attribute except `url` and `image_path` carries a secondary index.
CREATE TABLE IF NOT EXISTS `books` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`url` varchar(255) DEFAULT NULL,
`title` varchar(255) NOT NULL,
`description` text,
`image_path` varchar(255) DEFAULT NULL,
`rating` smallint(6) DEFAULT NULL,
`upc` varchar(32) DEFAULT NULL,
`product_type` varchar(32) DEFAULT NULL,
`price_excl_tax` decimal(6,2) DEFAULT NULL,
`price_incl_tax` decimal(6,2) DEFAULT NULL,
`tax` decimal(6,2) DEFAULT NULL,
`in_stock` int(11) DEFAULT NULL,
`count_reviews` int(11) DEFAULT NULL,
`category` varchar(32) DEFAULT NULL,
`currency_type` varchar(4) DEFAULT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `upc` (`upc`),
KEY `ix_books_category` (`category`),
KEY `ix_books_count_reviews` (`count_reviews`),
KEY `ix_books_currency_type` (`currency_type`),
KEY `ix_books_in_stock` (`in_stock`),
KEY `ix_books_price_excl_tax` (`price_excl_tax`),
KEY `ix_books_price_incl_tax` (`price_incl_tax`),
KEY `ix_books_product_type` (`product_type`),
KEY `ix_books_rating` (`rating`),
KEY `ix_books_tax` (`tax`),
KEY `ix_books_title` (`title`)
-- NOTE(review): AUTO_INCREMENT=2001 was carried over from the dumped
-- database, so ids in a freshly created table start at 2001 - confirm
-- whether that is intended.
) ENGINE=InnoDB AUTO_INCREMENT=2001 DEFAULT CHARSET=utf8;
-- Data exporting was unselected.
/*!40101 SET SQL_MODE=IFNULL(@OLD_SQL_MODE, '') */;
/*!40014 SET FOREIGN_KEY_CHECKS=IF(@OLD_FOREIGN_KEY_CHECKS IS NULL, 1, @OLD_FOREIGN_KEY_CHECKS) */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
### DATABASE CONFIG
### alembic.ini must be configured separately (see its sqlalchemy.url field)
DB_CONNECTION=mysql+pymysql
DB_HOST=127.0.0.1
DB_PORT=3306
DB_DATABASE=database
DB_USERNAME=username
DB_PASSWORD=secret
### MAIL CONFIG
MAIL_FROM = 'example@example.com'
MAIL_USER = 'example@example.com'
MAIL_PASS = 'secret'
MAIL_HOST = 'smtp.gmail.com'
MAIL_PORT = 465
MAIL_TLS = False
MAIL_SSL = True
### LOGGER CONFIG
### levels: DEBUG, INFO, WARNING, ERROR
LOG_LEVEL = DEBUG
LOG_FILE = ../log.log
LOG_ENABLED = False
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s : %(message)s'
LOG_STDOUT = False
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = database
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; this defaults
# to database/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat database/versions
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
# sqlalchemy.url = driver://user:pass@host:port/dbname
sqlalchemy.url = mysql+pymysql://user:password@localhost/databasename
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from scrapy.conf import settings
from utils.Singleton import Singleton
class Connector(metaclass=Singleton):
    """Class-level holder of the SQLAlchemy engine, session factory and connection.

    The engine and the session factory are built once, when this class body
    is executed, from the ``CONNECTION_STRING`` Scrapy setting.
    """

    # Connection handed out by get_connection(); opened on first use.
    __connection = None
    __engine = create_engine(settings['CONNECTION_STRING'])
    __Session = sessionmaker(bind=__engine)

    @staticmethod
    def get_session():
        """Return a new ORM session bound to the shared engine."""
        return Connector.__Session()

    @staticmethod
    def get_connection():
        """Return the shared engine connection, opening it when needed.

        The connection is created on the first call and reused afterwards.
        If the cached connection has been closed in the meantime, a fresh
        one is opened instead of handing back an unusable object.
        """
        cached = Connector.__connection
        if cached is None or cached.closed:
            cached = Connector.__engine.connect()
            Connector.__connection = cached
        return cached
Generic single-database configuration.
\ No newline at end of file
from ..Connector import Connector
from ..models.Book import Book
class BookConnector(Connector):
    """Stores scraped book items as ``Book`` rows."""

    def save_book(self, item):
        """Insert one scraped item as a new ``Book`` row.

        Args:
            item: Mapping of scraped values. Keys matching ``Book``
                attributes are copied onto the row; ``item['images']``,
                when present and non-empty, supplies ``image_path`` from
                its first entry's ``'path'``.

        Raises:
            Exception: Whatever the session raises while adding or
                committing; the transaction is rolled back first.
        """
        book = Book()
        book.update(item)

        # An item may arrive without any downloaded image; image_path is
        # then left unset instead of failing the whole insert.
        images = item.get('images') or []
        if images:
            book.image_path = images[0]['path']

        session = self.get_session()
        try:
            session.add(book)
            session.commit()
        except Exception:
            # Leave no half-finished transaction behind before re-raising.
            session.rollback()
            raise
        finally:
            # Each call gets its own session, so release it every time.
            session.close()
\ No newline at end of file
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context
import os
import sys
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)

# Put the parent of this file's directory on sys.path so the ``database``
# package can be imported below. os.path.dirname is used rather than
# splitting on a backslash so the result is correct on every OS, not just
# Windows.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from database.models.Book import Book

# Model metadata handed to context.configure() for 'autogenerate' support.
target_metadata = [Book.metadata]
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    Only the database URL from the ini file is handed to the migration
    context; no Engine is created here. Calls to context.execute() emit
    the given string to the script output.
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online():
    """Run migrations in 'online' mode.

    Builds an Engine from the ``sqlalchemy.``-prefixed options of the
    active ini section, opens a connection and runs the migrations
    inside a transaction on it.
    """
    ini_section = config.get_section(config.config_ini_section)
    engine = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)
        with context.begin_transaction():
            context.run_migrations()
# Module-level entry point: pick the runner matching the mode Alembic reports.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
from sqlalchemy import Column, Integer, String, Text, SmallInteger, Numeric
from sqlalchemy.ext.declarative import declarative_base
from .Model import Model
Base = declarative_base()
class Book(Base, Model):
    """ORM mapping for a row of the ``books`` table."""

    __tablename__ = 'books'

    # Identity and origin
    id = Column(Integer, primary_key=True)
    url = Column(String(length=255))

    # Descriptive data
    title = Column(String(length=255), nullable=False, index=True)
    description = Column(Text)
    image_path = Column(String(length=255))
    rating = Column(SmallInteger, index=True)

    # Product details
    upc = Column(String(length=32), unique=True)
    product_type = Column(String(length=32), index=True)
    price_excl_tax = Column(Numeric(precision=6, scale=2), index=True)
    price_incl_tax = Column(Numeric(precision=6, scale=2), index=True)
    tax = Column(Numeric(precision=6, scale=2), index=True)
    in_stock = Column(Integer, index=True)
    count_reviews = Column(Integer, index=True)
    category = Column(String(length=32), index=True)
    currency_type = Column(String(length=4), index=True)
class Model:
    """Mixin adding bulk attribute assignment from a mapping."""

    def update(self, dct):
        """Copy values from ``dct`` onto attributes this object already has.

        Keys that do not name an existing attribute are ignored.
        """
        for attr_name, attr_value in dct.items():
            if not hasattr(self, attr_name):
                continue
            setattr(self, attr_name, attr_value)
\ No newline at end of file
# -*- coding: UTF-8 -*-
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}
"""create books table
Revision ID: 3217c19ef3a6
Revises:
Create Date: 2019-06-13 18:27:06.732796
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '3217c19ef3a6'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
    """Create the ``books`` table followed by its non-unique indexes."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'books',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('url', sa.String(length=255), nullable=True),
        sa.Column('title', sa.String(length=255), nullable=False),
        sa.Column('description', sa.Text(), nullable=True),
        sa.Column('image_path', sa.String(length=255), nullable=True),
        sa.Column('rating', sa.SmallInteger(), nullable=True),
        sa.Column('upc', sa.String(length=32), nullable=True),
        sa.Column('product_type', sa.String(length=32), nullable=True),
        sa.Column('price_excl_tax', sa.Numeric(precision=6, scale=2), nullable=True),
        sa.Column('price_incl_tax', sa.Numeric(precision=6, scale=2), nullable=True),
        sa.Column('tax', sa.Numeric(precision=6, scale=2), nullable=True),
        sa.Column('in_stock', sa.Integer(), nullable=True),
        sa.Column('count_reviews', sa.Integer(), nullable=True),
        sa.Column('category', sa.String(length=32), nullable=True),
        sa.Column('currency_type', sa.String(length=4), nullable=True),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('upc'),
    )

    # One index per column, named ix_books_<column>, in creation order.
    indexed_columns = (
        'category',
        'count_reviews',
        'currency_type',
        'in_stock',
        'price_excl_tax',
        'price_incl_tax',
        'product_type',
        'rating',
        'tax',
        'title',
    )
    for column_name in indexed_columns:
        op.create_index(op.f('ix_books_' + column_name), 'books', [column_name], unique=False)
    # ### end Alembic commands ###
def downgrade():
    """Drop the ``books`` indexes and then the table itself."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Indexes named ix_books_<column>, listed in the order they are dropped.
    indexed_columns = (
        'title',
        'tax',
        'rating',
        'product_type',
        'price_incl_tax',
        'price_excl_tax',
        'in_stock',
        'currency_type',
        'count_reviews',
        'category',
    )
    for column_name in indexed_columns:
        op.drop_index(op.f('ix_books_' + column_name), table_name='books')

    op.drop_table('books')
    # ### end Alembic commands ###
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class BookItem(scrapy.Item):
    """Container for the values scraped for a single book.

    Apart from ``image_urls`` and ``images``, every field name matches a
    column of the ``books`` table.
    """

    url = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
    # NOTE(review): image_urls / images look like the input and output fields
    # of Scrapy's images pipeline - confirm against the project settings.
    # The persistence code reads item['images'][0]['path'].
    image_urls = scrapy.Field()
    images = scrapy.Field()
    rating = scrapy.Field()
    upc = scrapy.Field()
    product_type = scrapy.Field()
    price_excl_tax = scrapy.Field()
    price_incl_tax = scrapy.Field()
    tax = scrapy.Field()
    in_stock = scrapy.Field()
    count_reviews = scrapy.Field()
    category = scrapy.Field()
    currency_type = scrapy.Field()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment