Commit 710c7cff authored by Kyryll Parolis

Started development of the new extension.

parent 37316031
@@ -9,3 +9,4 @@ src/spiders/__pycache__
src/alembic/versions/__pycache__
src/alembic/__pycache__
log.txt
src/extensions/__pycache__
@@ -572,7 +572,7 @@ test = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
content-hash = "0de57a97c83300aca7d39dd0d1b5d86febb73d2cde82631e4a1b61478e826bc9"
content-hash = "861cc85f60e1ed54340e51be3521fedb7bdce8e992bcae10685f9c6532a70ed7"
python-versions = "^3.5"
[metadata.files]
@@ -16,3 +16,4 @@ requests = "^2.23.0"
pillow = "^7.1.2"
python-dotenv = "^0.13.0"
pylint = "^2.5.3"
twisted = "^20.3.0"
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import task
from twisted.internet import reactor

from extensions.extensions import SpiderOpenCloseLogging

process = CrawlerProcess(get_project_settings())
extension = SpiderOpenCloseLogging(1000)


def cbLoopDone(result):
    """
    Called when the loop stops successfully.
    """
    print("Loop done.")
    reactor.stop()


def ebLoopFailed(failure):
    """
    Called when the loop execution fails.
    """
    print(failure.getBriefTraceback())
    reactor.stop()


# Periodically check whether any new items have been scraped.
loop = task.LoopingCall(extension.check_scraped)
loopDeferred = loop.start(300.0)
loopDeferred.addCallback(cbLoopDone)
loopDeferred.addErrback(ebLoopFailed)

process.crawl("sitemap")
# process.start() runs the Twisted reactor and blocks until the crawl
# finishes, so it also drives the LoopingCall; a separate reactor.run()
# afterwards is not needed and would fail, as the reactor cannot be restarted.
process.start()
import datetime
import logging

from scrapy import signals
from scrapy.exceptions import NotConfigured, CloseSpider
from twisted.internet.task import LoopingCall
from twisted.internet import reactor

logger = logging.getLogger(__name__)
#
# def cbLoopDone(result):
# """
# Called when loop was stopped with success.
# """
# print("Loop done.")
# reactor.stop()
#
#
# def ebLoopFailed(failure):
# """
# Called when loop execution failed.
# """
# print(failure.getBriefTraceback())
# reactor.stop()
class SpiderOpenCloseLogging:

    def __init__(self, item_count):
        self.item_count = item_count
        self.items_scraped = 0
        self.last_items = 0
        self.time = datetime.datetime.now()

    @classmethod
    def from_crawler(cls, crawler):
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        # get the number of items from settings
        item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # instantiate the extension object
        ext = cls(item_count)

        # connect the extension object to signals
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

        # return the extension object
        return ext

    def spider_opened(self, spider):
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        # Track the total and the number of items seen since the last check.
        self.items_scraped += 1
        self.last_items += 1

    def check_scraped(self):
        # Periodic check: complain if nothing new arrived since the previous
        # call, otherwise reset the counter for the next interval.
        logger.info("Checking if any new items were scraped.")
        if self.last_items == 0:
            logger.critical("No new items were scraped.")
            raise CloseSpider("No new items were scraped during the last 5 minutes.")
        else:
            logger.info("New items were scraped. Next check in 5 minutes.")
            self.last_items = 0
# loop = LoopingCall(check_scraped)
# loopDeferred = loop.start(300.0)
#
# loopDeferred.addCallback(cbLoopDone)
# loopDeferred.addErrback(ebLoopFailed)
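One detail worth flagging: a CloseSpider raised from the LoopingCall is not handled by Scrapy's usual CloseSpider machinery; it only errbacks the loop's Deferred. A possible alternative wiring, shown purely as a sketch and not part of this commit, keeps the periodic check inside the extension itself, starts it on spider_opened, stops it on spider_closed, and asks the engine to close the spider when nothing new was scraped. The class name IdleCheckExtension, the MYEXT_CHECK_INTERVAL setting, and the "no_new_items" reason are made-up illustration names.

import logging

from scrapy import signals
from scrapy.exceptions import NotConfigured
from twisted.internet import task

logger = logging.getLogger(__name__)


class IdleCheckExtension:
    """Hypothetical variant: the periodic idle check lives inside the extension."""

    def __init__(self, crawler, interval=300.0):
        self.crawler = crawler
        self.interval = interval
        self.last_items = 0
        self.loop = None

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool("MYEXT_ENABLED"):
            raise NotConfigured
        ext = cls(crawler, crawler.settings.getfloat("MYEXT_CHECK_INTERVAL", 300.0))
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def spider_opened(self, spider):
        # Start the periodic check once the spider is running.
        self.loop = task.LoopingCall(self.check_scraped, spider)
        self.loop.start(self.interval, now=False)

    def spider_closed(self, spider):
        # Stop the loop so it does not outlive the spider.
        if self.loop and self.loop.running:
            self.loop.stop()

    def item_scraped(self, item, spider):
        self.last_items += 1

    def check_scraped(self, spider):
        # Close the spider through the engine instead of raising CloseSpider,
        # since this runs outside a spider callback.
        if self.last_items == 0:
            logger.critical("No new items were scraped during the last interval.")
            self.crawler.engine.close_spider(spider, "no_new_items")
        else:
            logger.info("New items were scraped. Next check in %s seconds.", self.interval)
            self.last_items = 0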
@@ -104,3 +104,5 @@ MAIL_USER = os.getenv("MAIL_USER")
MAIL_PASS = os.getenv("MAIL_PASS")
config.fileConfig('logging.ini', defaults={'logfilename': os.getenv("LOG_FILE")})
MYEXT_ENABLED = True
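For the from_crawler hook above to run at all, the extension class also has to be listed in Scrapy's EXTENSIONS setting; MYEXT_ENABLED only gates it once it is loaded. A minimal sketch, assuming the dotted path that matches the import used in this commit (from extensions.extensions import SpiderOpenCloseLogging); the order value and the MYEXT_ITEMCOUNT override are illustrative:

# Sketch (assumption): register the extension so Scrapy loads it and calls
# its from_crawler classmethod; the order value 500 is arbitrary.
EXTENSIONS = {
    "extensions.extensions.SpiderOpenCloseLogging": 500,
}

# Optional: override the default item count read in from_crawler.
MYEXT_ITEMCOUNT = 1000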