diff --git a/.gitignore b/.gitignore index 2d56f814c04e94d4f9e7a278c044b56f719adeb5..523b4a0d6e2a4836e1b97794bc95711d77bc71fc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,10 @@ .* +image !.gitignore +!.env.example +alembic.ini *.pyc -*.json \ No newline at end of file +__pycache__ +*.json +src/spiders/test_spider.py +*.log \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..35436e837e1967423283a17d9d5c32c67b48d8d9 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +1. Rename .env.example to .env and set your settings. +2. Rename alembic.ini.example to alembic.ini and set the sqlalchemy.url field. +3. Install the dependencies from the requirments.txt file. +4. Create the database tables using sql\create-dump.sql, or apply the migration by running "alembic upgrade head" in the src directory. +5. In the src directory, run the "scrapy crawl books" command to start parsing. \ No newline at end of file diff --git a/requirments.txt b/requirments.txt new file mode 100644 index 0000000000000000000000000000000000000000..00f7343300a4d6eafb565c57b224953cd7e1b779 --- /dev/null +++ b/requirments.txt @@ -0,0 +1,36 @@ +alembic==1.0.10 +asn1crypto==0.24.0 +attrs==19.1.0 +Automat==0.7.0 +cffi==1.12.3 +constantly==15.1.0 +cryptography==2.7 +cssselect==1.0.3 +hyperlink==19.0.0 +idna==2.8 +incremental==17.5.0 +lxml==4.3.4 +Mako==1.0.12 +MarkupSafe==1.1.1 +parsel==1.5.1 +Pillow==6.0.0 +pyasn1==0.4.5 +pyasn1-modules==0.2.5 +pycparser==2.19 +PyDispatcher==2.0.5 +PyHamcrest==1.9.0 +PyMySQL==0.9.3 +pyOpenSSL==19.0.0 +pypiwin32==223 +python-dateutil==2.8.0 +python-dotenv==0.10.3 +python-editor==1.0.4 +pywin32==224 +queuelib==1.5.0 +Scrapy==1.6.0 +service-identity==18.1.0 +six==1.12.0 +SQLAlchemy==1.3.4 +Twisted==19.2.1 +w3lib==1.20.0 +zope.interface==4.6.0 diff --git a/sql/.gitignore b/sql/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sql/create-dump.sql b/sql/create-dump.sql new file mode 100644 index 0000000000000000000000000000000000000000..c56003f5c6ae685d94073b5f19f53696d41fed61 --- /dev/null +++ b/sql/create-dump.sql @@ -0,0 +1,55 @@ +-- -------------------------------------------------------- +-- Host: 127.0.0.1 +-- Server version: 8.0.15 - MySQL Community Server - GPL +-- Operating system: Win64 +-- HeidiSQL Version: 10.1.0.5464 +-- -------------------------------------------------------- + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET NAMES utf8 */; +/*!50503 SET NAMES utf8mb4 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; + +-- Dumping structure for table db-scrapy-tutorial.alembic_version +CREATE TABLE IF NOT EXISTS `alembic_version` ( + `version_num` varchar(32) NOT NULL, + PRIMARY KEY (`version_num`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + +-- Data exporting was unselected.
+-- Dumping structure for table db-scrapy-tutorial.books +CREATE TABLE IF NOT EXISTS `books` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `url` varchar(255) DEFAULT NULL, + `title` varchar(255) NOT NULL, + `description` text, + `image_path` varchar(255) DEFAULT NULL, + `rating` smallint(6) DEFAULT NULL, + `upc` varchar(32) DEFAULT NULL, + `product_type` varchar(32) DEFAULT NULL, + `price_excl_tax` decimal(6,2) DEFAULT NULL, + `price_incl_tax` decimal(6,2) DEFAULT NULL, + `tax` decimal(6,2) DEFAULT NULL, + `in_stock` int(11) DEFAULT NULL, + `count_reviews` int(11) DEFAULT NULL, + `category` varchar(32) DEFAULT NULL, + `currency_type` varchar(4) DEFAULT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `upc` (`upc`), + KEY `ix_books_category` (`category`), + KEY `ix_books_count_reviews` (`count_reviews`), + KEY `ix_books_currency_type` (`currency_type`), + KEY `ix_books_in_stock` (`in_stock`), + KEY `ix_books_price_excl_tax` (`price_excl_tax`), + KEY `ix_books_price_incl_tax` (`price_incl_tax`), + KEY `ix_books_product_type` (`product_type`), + KEY `ix_books_rating` (`rating`), + KEY `ix_books_tax` (`tax`), + KEY `ix_books_title` (`title`) +) ENGINE=InnoDB AUTO_INCREMENT=2001 DEFAULT CHARSET=utf8; + +-- Data exporting was unselected. +/*!40101 SET SQL_MODE=IFNULL(@OLD_SQL_MODE, '') */; +/*!40014 SET FOREIGN_KEY_CHECKS=IF(@OLD_FOREIGN_KEY_CHECKS IS NULL, 1, @OLD_FOREIGN_KEY_CHECKS) */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; diff --git a/src/.env.example b/src/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..b9541b7d01a71c75f5c97abfb5ed7348a70db675 --- /dev/null +++ b/src/.env.example @@ -0,0 +1,28 @@ +### DATABASE CONFIG +### the same connection settings must also be set in alembic.ini +DB_CONNECTION=mysql+pymysql +DB_HOST=127.0.0.1 +DB_PORT=3306 +DB_DATABASE=database +DB_USERNAME=username +DB_PASSWORD=secret + +### MAIL CONFIG +MAIL_FROM = 'example@example.com' +MAIL_USER = 'example@example.com' +MAIL_PASS = 'secret' +MAIL_HOST = 'smtp.gmail.com' +MAIL_PORT = 465 +MAIL_TLS = False +MAIL_SSL = True + +### LOGGER CONFIG +### levels: DEBUG, INFO, WARNING, ERROR +LOG_LEVEL = DEBUG +LOG_FILE = ../log.log +LOG_ENABLED = False +LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s : %(message)s' +LOG_STDOUT = False + + + diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/alembic.ini.example b/src/alembic.ini.example new file mode 100644 index 0000000000000000000000000000000000000000..fe3f6a73d740f8fbe643b428b5bdfe2a3042b052 --- /dev/null +++ b/src/alembic.ini.example @@ -0,0 +1,75 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = database + +# template used to generate migration files +# file_template = %%(rev)s_%%(slug)s + +# timezone to use when rendering the date +# within the migration file as well as the filename. +# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; this defaults +# to database/versions.
When using multiple version +# directories, initial revisions must be specified with --version-path +# version_locations = %(here)s/bar %(here)s/bat database/versions + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# sqlalchemy.url = driver://user:pass@host:port/dbname +sqlalchemy.url = mysql+pymysql://user:password@localhost/databasename + + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/src/database/Connector.py b/src/database/Connector.py new file mode 100644 index 0000000000000000000000000000000000000000..3641bcd46d73fd64ee0ad8a8b1471e7f7adab609 --- /dev/null +++ b/src/database/Connector.py @@ -0,0 +1,20 @@ +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from scrapy.conf import settings +from utils.Singleton import Singleton + + +class Connector(metaclass=Singleton): + __connection = None + __engine = create_engine(settings['CONNECTION_STRING']) + __Session = sessionmaker(bind=__engine) + + @staticmethod + def get_session(): + return Connector.__Session() + + @staticmethod + def get_connection(): + if not Connector.__connection: + Connector.__connection = Connector.__engine.connect() + return Connector.__connection diff --git a/src/database/README b/src/database/README new file mode 100644 index 0000000000000000000000000000000000000000..98e4f9c44effe479ed38c66ba922e7bcc672916f --- /dev/null +++ b/src/database/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/src/database/__init__.py b/src/database/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/database/connectors/BookConnector.py b/src/database/connectors/BookConnector.py new file mode 100644 index 0000000000000000000000000000000000000000..bcabcbebb00610a338089dbd6dd959043b11b7ff --- /dev/null +++ b/src/database/connectors/BookConnector.py @@ -0,0 +1,14 @@ +from ..Connector import Connector +from ..models.Book import Book + + +class BookConnector(Connector): + def save_book(self, item): + session = self.get_session() + + book = Book() + book.update(item) + book.image_path = item['images'][0]['path'] + + session.add(book) + session.commit() \ No newline at end of file diff --git a/src/database/connectors/__init__.py b/src/database/connectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/database/env.py b/src/database/env.py new file mode 100644 index 0000000000000000000000000000000000000000..62621fb1da2fbb7e947d4a8edc3cbaaf517deac4 --- /dev/null +++ b/src/database/env.py @@ -0,0 +1,83 @@ + +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context +import os +import sys + + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. 
+config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +# target_metadata = None + +# added +sys.path.insert(0, '\\'.join(os.path.dirname(os.path.abspath(__file__)).split('\\')[:-1])) +from database.models.Book import Book +target_metadata = [Book.metadata] + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline(): + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, target_metadata=target_metadata, literal_binds=True + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online(): + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/src/database/models/Book.py b/src/database/models/Book.py new file mode 100644 index 0000000000000000000000000000000000000000..bda89a4718947541582a55ed1904107f953bf374 --- /dev/null +++ b/src/database/models/Book.py @@ -0,0 +1,28 @@ +from sqlalchemy import Column, Integer, String, Text, SmallInteger, Numeric +from sqlalchemy.ext.declarative import declarative_base +from .Model import Model + +Base = declarative_base() + + +class Book(Base, Model): + __tablename__ = 'books' + id = Column(Integer, primary_key=True) + url = Column(String(255)) + + title = Column(String(255), nullable=False, index=True) + description = Column(Text) + image_path = Column(String(255)) + rating = Column(SmallInteger, index=True) + + upc = Column(String(32), unique=True) + product_type = Column(String(32), index=True) + price_excl_tax = Column(Numeric(6, 2), index=True) + price_incl_tax = Column(Numeric(6, 2), index=True) + tax = Column(Numeric(6, 2), index=True) + in_stock = Column(Integer, index=True) + count_reviews = Column(Integer, index=True) + + category = Column(String(32), index=True) + + currency_type = Column(String(4), index=True) diff --git a/src/database/models/Model.py b/src/database/models/Model.py new file mode 100644 index 0000000000000000000000000000000000000000..5cdcba2a49d149cc67ca54d5010ac74113c2a8bd --- /dev/null +++ b/src/database/models/Model.py @@ -0,0 +1,7 @@ + + +class Model: + def update(self, dct): + for key, value in dct.items(): + if hasattr(self, key): + setattr(self, key, value) \ No newline at end of file diff --git a/src/database/models/__init__.py b/src/database/models/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/database/script.py.mako b/src/database/script.py.mako new file mode 100644 index 0000000000000000000000000000000000000000..916ce7f38598baddefd8718223d64ef2322dbdd5 --- /dev/null +++ b/src/database/script.py.mako @@ -0,0 +1,25 @@ +# -*- coding: UTF-8 -*- +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade(): + ${upgrades if upgrades else "pass"} + + +def downgrade(): + ${downgrades if downgrades else "pass"} diff --git a/src/database/versions/3217c19ef3a6_create_books_table.py b/src/database/versions/3217c19ef3a6_create_books_table.py new file mode 100644 index 0000000000000000000000000000000000000000..c819ceb744c027c8b6b941a4c3b82c68f116b862 --- /dev/null +++ b/src/database/versions/3217c19ef3a6_create_books_table.py @@ -0,0 +1,66 @@ +"""create books table + +Revision ID: 3217c19ef3a6 +Revises: +Create Date: 2019-06-13 18:27:06.732796 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '3217c19ef3a6' +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('books', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('url', sa.String(length=255), nullable=True), + sa.Column('title', sa.String(length=255), nullable=False), + sa.Column('description', sa.Text(), nullable=True), + sa.Column('image_path', sa.String(length=255), nullable=True), + sa.Column('rating', sa.SmallInteger(), nullable=True), + sa.Column('upc', sa.String(length=32), nullable=True), + sa.Column('product_type', sa.String(length=32), nullable=True), + sa.Column('price_excl_tax', sa.Numeric(precision=6, scale=2), nullable=True), + sa.Column('price_incl_tax', sa.Numeric(precision=6, scale=2), nullable=True), + sa.Column('tax', sa.Numeric(precision=6, scale=2), nullable=True), + sa.Column('in_stock', sa.Integer(), nullable=True), + sa.Column('count_reviews', sa.Integer(), nullable=True), + sa.Column('category', sa.String(length=32), nullable=True), + sa.Column('currency_type', sa.String(length=4), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('upc') + ) + op.create_index(op.f('ix_books_category'), 'books', ['category'], unique=False) + op.create_index(op.f('ix_books_count_reviews'), 'books', ['count_reviews'], unique=False) + op.create_index(op.f('ix_books_currency_type'), 'books', ['currency_type'], unique=False) + op.create_index(op.f('ix_books_in_stock'), 'books', ['in_stock'], unique=False) + op.create_index(op.f('ix_books_price_excl_tax'), 'books', ['price_excl_tax'], unique=False) + op.create_index(op.f('ix_books_price_incl_tax'), 'books', ['price_incl_tax'], unique=False) + op.create_index(op.f('ix_books_product_type'), 'books', ['product_type'], unique=False) + op.create_index(op.f('ix_books_rating'), 'books', ['rating'], unique=False) + op.create_index(op.f('ix_books_tax'), 'books', ['tax'], unique=False) + op.create_index(op.f('ix_books_title'), 'books', ['title'], unique=False) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by 
Alembic - please adjust! ### + op.drop_index(op.f('ix_books_title'), table_name='books') + op.drop_index(op.f('ix_books_tax'), table_name='books') + op.drop_index(op.f('ix_books_rating'), table_name='books') + op.drop_index(op.f('ix_books_product_type'), table_name='books') + op.drop_index(op.f('ix_books_price_incl_tax'), table_name='books') + op.drop_index(op.f('ix_books_price_excl_tax'), table_name='books') + op.drop_index(op.f('ix_books_in_stock'), table_name='books') + op.drop_index(op.f('ix_books_currency_type'), table_name='books') + op.drop_index(op.f('ix_books_count_reviews'), table_name='books') + op.drop_index(op.f('ix_books_category'), table_name='books') + op.drop_table('books') + # ### end Alembic commands ### diff --git a/src/items/BookItem.py b/src/items/BookItem.py new file mode 100644 index 0000000000000000000000000000000000000000..2387741f6dc7aef0ecbca30d28d318160ab2ecb3 --- /dev/null +++ b/src/items/BookItem.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class BookItem(scrapy.Item): + url = scrapy.Field() + + title = scrapy.Field() + description = scrapy.Field() + image_urls = scrapy.Field() + images = scrapy.Field() + rating = scrapy.Field() + + upc = scrapy.Field() + product_type = scrapy.Field() + price_excl_tax = scrapy.Field() + price_incl_tax = scrapy.Field() + tax = scrapy.Field() + in_stock = scrapy.Field() + count_reviews = scrapy.Field() + + category = scrapy.Field() + + currency_type = scrapy.Field() diff --git a/src/items/__init__.py b/src/items/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ b/src/items/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/src/middlewares/__init__.py b/src/middlewares/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ b/src/middlewares/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/src/middlewares/middlewares.py b/src/middlewares/middlewares.py new file mode 100644 index 0000000000000000000000000000000000000000..79182cb4c6c530f8b6400168df7ef7a5d7e2b3a5 --- /dev/null +++ b/src/middlewares/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class TutorialSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. 
+ return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class TutorialDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/src/pipelines/DatabaseWriter.py b/src/pipelines/DatabaseWriter.py new file mode 100644 index 0000000000000000000000000000000000000000..36bf75009bdce06a6663bb06fbd051a34818dbff --- /dev/null +++ b/src/pipelines/DatabaseWriter.py @@ -0,0 +1,8 @@ + + +class DatabaseWriter: + name = 'DatabaseWriter' + + def process_item(self, item, spider): + spider.save_book(item) + return item diff --git a/src/pipelines/StatsMailer.py b/src/pipelines/StatsMailer.py new file mode 100644 index 0000000000000000000000000000000000000000..1eec9e476d54a7176d816f5bc62c727b42e93cd1 --- /dev/null +++ b/src/pipelines/StatsMailer.py @@ -0,0 +1,14 @@ +from scrapy.mail import MailSender +from scrapy.conf import settings + + +class StatsMailer(object): + name = 'StatsMailer' + + def close_spider(self, spider): + mailer = MailSender.from_settings(settings) + to = [settings.get('MAIL_FROM')] + subject = 'Parser name:{} finished'.format(spider.name) + body = subject + ':\n\n' + body += '\n'.join(['{}: {}'.format(k, v) for k, v in spider.crawler.stats.get_stats().items()]) + return mailer.send(to=to, subject=subject, body=body) diff --git a/src/pipelines/StatsWriter.py b/src/pipelines/StatsWriter.py new file mode 100644 index 0000000000000000000000000000000000000000..da2042e549ff7ce174a39cb02794d172881496d5 --- /dev/null +++ b/src/pipelines/StatsWriter.py @@ -0,0 +1,10 @@ + + +class StatsWriter(object): + name = 'StatsWriter' + + def close_spider(self, spider): + stats = spider.crawler.stats.get_stats() + with open('../statistics.txt', 'w') as f: + for k, v in stats.items(): + f.writelines('{}: {}\n'.format(k, v)) diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ b/src/pipelines/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/src/scrapy.cfg b/src/scrapy.cfg new file mode 100644 index 0000000000000000000000000000000000000000..5bb0b9f49f160a3dc3f1bc481fa0bc2a1c2fe50b --- /dev/null +++ b/src/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = settings + +[deploy] +#url = http://localhost:6800/ +project = tutorial diff --git a/src/settings.py b/src/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..782dd67d2a5c6645fe778c2abfa1466cc6a6de10 --- /dev/null +++ b/src/settings.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +import logging +from dotenv import load_dotenv +import os +true_list = ['True', 'true', 'TRUE', 1, '1'] +load_dotenv() + +# Scrapy settings for tutorial project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'tutorial' + +SPIDER_MODULES = ['spiders'] +NEWSPIDER_MODULE = 'spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = 'tutorial (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'tutorial.middlewares.TutorialSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'tutorial.middlewares.TutorialDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +IMAGES_STORE = '../image' +ITEM_PIPELINES = { + 'pipelines.DatabaseWriter.DatabaseWriter': 101, + 'pipelines.StatsMailer.StatsMailer': 999, + 'pipelines.StatsWriter.StatsWriter': 1000, + 'scrapy.pipelines.images.ImagesPipeline': 1, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +CONNECTION_STRING = "{drivername}://{user}:{passwd}@{host}:{port}/{db_name}?charset=utf8".format( + drivername=os.getenv('DB_CONNECTION'), + user=os.getenv('DB_USERNAME'), + passwd=os.getenv('DB_PASSWORD'), + host=os.getenv('DB_HOST'), + port=os.getenv('DB_PORT'), + db_name=os.getenv('DB_DATABASE'), +) + +# Logger config +LOG_FILE = os.getenv('LOG_FILE', 'log.log') +LOG_ENABLED = 
os.getenv('LOG_ENABLED', 'True') in true_list +LOG_LEVEL = logging._nameToLevel.get(os.getenv('LOG_LEVEL', 'INFO')) +LOG_FORMAT = os.getenv('LOG_FORMAT', '%(asctime)s - %(levelname)s - %(name)s : \n %(message)s') +LOG_STDOUT = os.getenv('LOG_STDOUT', 'False') in true_list +LOG_SHORT_NAMES = False +# LOG_DATEFORMAT + +# Mail config +MAIL_FROM = os.getenv('MAIL_FROM', 'scrapy@localhost') +MAIL_HOST = os.getenv('MAIL_HOST') +MAIL_PORT = int(os.getenv('MAIL_PORT', 465)) +MAIL_USER = os.getenv('MAIL_USER') +MAIL_PASS = os.getenv('MAIL_PASS') +MAIL_TLS = os.getenv('MAIL_TLS', 'False') in true_list +MAIL_SSL = os.getenv('MAIL_SSL', 'True') in true_list diff --git a/src/spiders/__init__.py b/src/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ b/src/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/src/spiders/books_spider.py b/src/spiders/books_spider.py new file mode 100644 index 0000000000000000000000000000000000000000..e14ce74d2cd4020291a64f31c70a444e0caf5547 --- /dev/null +++ b/src/spiders/books_spider.py @@ -0,0 +1,59 @@ +from scrapy import Spider +import re +from items.BookItem import BookItem +from database.connectors.BookConnector import BookConnector + + +class BooksSpider(Spider, BookConnector): + name = 'books' + start_urls = ['http://books.toscrape.com/'] + page = 1 + + def parse(self, response): + self.logger.debug('Current page: {}'.format(self.page)) + + # follow links to book pages + for idx, href in enumerate(response.css('div.image_container a::attr(href)')): + yield response.follow(href, self.parse_book, meta={'idx': idx}) + + # pagination + next_page = response.css('li.next a::attr(href)').get() + if next_page is not None: + self.page += 1 + yield response.follow(next_page, callback=self.parse) + + def parse_book(self, response): + self.logger.debug('Index book in page: {}'.format(response.meta.get('idx'))) + + book = BookItem() + + table_data = response.css('table td::text').getall() + book.update({ + 'title': response.css('div.product_main h1::text').get(), + 'description': response.xpath("//*[@id='product_description']/following::p/text()").get(), + 'image_urls': [response.urljoin(response.css('div.active img::attr(src)').get())], + 'rating': self._get_rating(response.css('p.star-rating::attr(class)').get().split(' ')), + 'upc': table_data[0], + 'product_type': table_data[1], + 'price_excl_tax': table_data[2][1:], + 'price_incl_tax': table_data[3][1:], + 'tax': table_data[4][1:], + 'in_stock': int(re.search(r'\d+', table_data[5]).group()), + 'count_reviews': int(table_data[6]), + 'currency_type': table_data[2][0], + 'category': response.css('ul.breadcrumb li:nth-child(3) a::text').get(), + 'url': response.url, + }) + yield book + + def _get_rating(self, class_all): + if 'One' in class_all: + return 1 + if 'Two' in class_all: + return 2 + if 'Three' in class_all: + return 3 + if 'Four' in class_all: + return 4 + if 'Five' in class_all: + return 5 \ No newline at end of file diff --git a/src/utils/Singleton.py b/src/utils/Singleton.py new file mode 100644 index 0000000000000000000000000000000000000000..0ef78444aab817558f3fbc3a75c525bb32c9553c --- /dev/null +++ b/src/utils/Singleton.py @@ -0,0 +1,9 @@ + + +class Singleton(type): + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + 
cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] \ No newline at end of file diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/statistics.txt b/statistics.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb6d6f9079d8266651c5e9622aa3a97ac8972d87 --- /dev/null +++ b/statistics.txt @@ -0,0 +1,20 @@ +start_time: 2019-06-14 07:02:37.424266 +scheduler/enqueued/memory: 1050 +scheduler/enqueued: 1050 +scheduler/dequeued/memory: 1050 +scheduler/dequeued: 1050 +downloader/request_count: 2051 +downloader/request_method_count/GET: 2051 +downloader/request_bytes: 623523 +robotstxt/request_count: 1 +downloader/response_count: 2051 +downloader/response_status_count/404: 1 +downloader/response_bytes: 46033776 +response_received_count: 2051 +robotstxt/response_count: 1 +robotstxt/response_status_count/404: 1 +downloader/response_status_count/200: 2050 +request_depth_max: 50 +file_count: 1000 +file_status_count/downloaded: 1000 +item_scraped_count: 1000
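Note: the setup steps from the README.md added above condense to the short shell session below. This is a minimal sketch rather than part of the change set; it assumes a MySQL server is already running, that the database named in .env / alembic.ini exists, and that the commands are run from the repository root (POSIX shell shown; use "copy" instead of "cp" on Windows).

pip install -r requirments.txt              # the requirements file is spelled "requirments.txt" in this repo
cp src/.env.example src/.env                # then edit the DB_* (and optionally MAIL_* / LOG_*) values
cp src/alembic.ini.example src/alembic.ini  # then set sqlalchemy.url
cd src
alembic upgrade head                        # or load sql/create-dump.sql into MySQL instead
scrapy crawl books                          # start the books.toscrape.com crawl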