I'm using the CrawlOnce middleware together with Scrapy's persistent job directory (JOBDIR), and when I stop and restart a job, I sometimes get this error:
Traceback (most recent call last):
  File "/app/tasks/crawl.py", line 250, in crawl
    crawler_proc.start()
  File "/usr/local/lib/python3.9/site-packages/scrapy/crawler.py", line 346, in start
    reactor.run(installSignalHandlers=False) # blocking call
  File "/usr/local/lib/python3.9/site-packages/twisted/internet/base.py", line 1318, in run
    self.mainLoop()
  File "/usr/local/lib/python3.9/site-packages/twisted/internet/base.py", line 1328, in mainLoop
    reactorBaseSelf.runUntilCurrent()
--- <exception caught here> ---
  File "/usr/local/lib/python3.9/site-packages/twisted/internet/base.py", line 994, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/local/lib/python3.9/site-packages/scrapy/utils/reactor.py", line 51, in __call__
    return self._func(*self._a, **self._kw)
  File "/usr/local/lib/python3.9/site-packages/scrapy/core/engine.py", line 157, in _next_request
    self.crawl(request)
  File "/usr/local/lib/python3.9/site-packages/scrapy/core/engine.py", line 247, in crawl
    self._schedule_request(request, self.spider)
  File "/usr/local/lib/python3.9/site-packages/scrapy/core/engine.py", line 252, in _schedule_request
    if not self.slot.scheduler.enqueue_request(request): # type: ignore[union-attr]
  File "/usr/local/lib/python3.9/site-packages/scrapy/core/scheduler.py", line 241, in enqueue_request
    dqok = self._dqpush(request)
  File "/usr/local/lib/python3.9/site-packages/scrapy/core/scheduler.py", line 280, in _dqpush
    self.dqs.push(request)
  File "/usr/local/lib/python3.9/site-packages/scrapy/pqueues.py", line 89, in push
    self.queues[priority] = self.qfactory(priority)
  File "/usr/local/lib/python3.9/site-packages/scrapy/pqueues.py", line 76, in qfactory
    return create_instance(
  File "/usr/local/lib/python3.9/site-packages/scrapy/utils/misc.py", line 166, in create_instance
    instance = objcls.from_crawler(crawler, *args, **kwargs)
  File "/usr/local/lib/python3.9/site-packages/scrapy/squeues.py", line 68, in from_crawler
    return cls(crawler, key)
  File "/usr/local/lib/python3.9/site-packages/scrapy/squeues.py", line 64, in __init__
    super().__init__(key)
  File "/usr/local/lib/python3.9/site-packages/scrapy/squeues.py", line 23, in __init__
    super().__init__(path, *args, **kwargs)
  File "/usr/local/lib/python3.9/site-packages/queuelib/queue.py", line 208, in __init__
    (self.size,) = struct.unpack(self.SIZE_FORMAT, qsize)
struct.error: unpack requires a buffer of 4 bytes
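As far as I can tell from the bottom frames, the failure happens in queuelib's LifoDiskQueue (which backs the disk request queue Scrapy keeps under JOBDIR): on reopen it reads a 4-byte size header from the start of the queue file, and an interrupted shutdown can leave that file empty or truncated. Here is a rough standalone reproduction of just the low-level error; the direct queuelib usage and the throwaway path are my assumptions, not the exact files Scrapy writes:

# Rough reproduction of the struct.error alone, assuming queuelib's
# LifoDiskQueue; "/tmp/repro.queue" is a throwaway path, not my real JOBDIR.
import struct

from queuelib import LifoDiskQueue

path = "/tmp/repro.queue"

q = LifoDiskQueue(path)   # a new queue file starts with a 4-byte size header
q.push(b"serialized request")
q.close()

# Simulate the partial write left behind by an unclean shutdown:
# cut the file down to less than the 4-byte header.
with open(path, "r+b") as f:
    f.truncate(2)

try:
    LifoDiskQueue(path)   # reopening reads the size header first
except struct.error as exc:
    print("reproduced:", exc)   # unpack requires a buffer of 4 bytes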
Deleting the on-disk request queue fixes the error, but that creates another problem: if the spider was stopped partway through a paginated request flow, deleting the queue also throws away the pending pagination requests. Since my crawler starts only from the base URLs listed in an input file, it has no way to recover the follow-up pagination requests that were generated dynamically during the crawl. A sketch of the spider shape I mean is below.
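For context, a minimal sketch of that spider shape (the file name, spider name, and selectors are placeholders, not my real code); the only requests known up front are the base URLs, and every "next page" request exists solely in the scheduler's queue:

import scrapy


class PaginatedSpider(scrapy.Spider):
    name = "paginated"

    def start_requests(self):
        # Only the base URLs from the input file are known up front.
        with open("input_urls.txt") as f:
            for line in f:
                url = line.strip()
                if url:
                    yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Item links on the current page.
        for href in response.css("a.item::attr(href)").getall():
            yield response.follow(href, callback=self.parse_item)

        # The "next page" request is generated dynamically here; if the job
        # stops on page N and the disk queue is deleted, page N+1 is gone.
        next_page = response.css("a.next::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_item(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}

Stopping on page 3 of a listing, for example, means the request for page 4 lives only in the on-disk queue; once that queue is deleted, a restart from input_urls.txt has no way to know page 4 was ever pending.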