Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to run Scrapy in a while loop

So I'm doing a project scraping different websites using multiple spiders. I want to make it so that the spiders run again when the user answers "Yes" when asked to continue.

# Ask once for the search parameters; they are reused on every pass.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

# NOTE(review): process.start() runs the Twisted reactor, and a Twisted
# reactor cannot be started a second time in the same process -- the
# second iteration of this loop raises ReactorNotRestartable (see the
# traceback below).  This is the defect the question is about.
while flag:

   process = CrawlProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()  # blocking call; starts (and stops) the reactor

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

But I get an error saying reactor is not restartable.

Traceback (most recent call last):
  File "/Users/user/Desktop/programs/eshopSpider/eshopSpider.py", line 47, in <module>
    process.start()
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/scrapy/crawler.py", line 327, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1317, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 1299, in startRunning
    ReactorBase.startRunning(cast(ReactorBase, self))
  File "/Users/user/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/base.py", line 843, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable

So I guess using a while loop is a no-go. I don't know where to even start...

like image 303
invisibleufo101 Avatar asked Sep 07 '25 02:09

invisibleufo101


2 Answers

Method 1:

Scrapy creates a Twisted Reactor, which can't be reused after it stops — but if you run the crawler in a separate process, then each new process has to create its own new Reactor.

import multiprocessing

def run_crawler(keyword, page_range):
   """Run all three crawlers to completion inside the calling process.

   Intended to be used as the target of multiprocessing.Process: each OS
   process creates its own Twisted reactor, so every invocation in a
   fresh child process gets a fresh, startable reactor.
   """
   process = CrawlProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()  # blocking call; returns when all crawls finish

# --- main ---

# Ask once; the same keyword / page range is reused for every run.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   # BUG FIX: `multiprocessing` is a module and is not callable; the
   # worker must be created with multiprocessing.Process(...).  Running
   # the crawl in a child process gives it a fresh Twisted reactor,
   # which is what makes repeating the crawl possible.
   p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
   p.start()
   p.join()  # wait for this round of crawling to finish

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

It will not work if you use threading instead of multiprocessing, because threads share the interpreter state, so a new thread would end up using the same Reactor as the previous thread.


Minimal working code (tested on Linux).

import scrapy

class MySpider(scrapy.Spider):
    """Scrape book titles and cover-image URLs from books.toscrape.com."""

    name = 'myspider'

    def __init__(self, keyword, page, *args, **kwargs):
        """Remember the search parameters and build start_urls for *page*."""
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        # One <article class="product_pod"> per book on the listing page.
        for product in response.css('article.product_pod'):
            book_title = product.css('h3 a::text').get()
            image_url = response.urljoin(product.css('img::attr(src)').get())
            yield {
                'page': self.page,
                'keyword': self.keyword,
                'title': book_title,
                'image': image_url,
            }

# --- run without project and save in `output.csv` ---

import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    """Crawl three consecutive pages (page, page+1, page+2) for *keyword*
    and append the scraped items to output.csv.

    BUG FIX: the parameter was named ``page_range`` while the body used
    ``page``.  That only worked by accident on Linux, where the forked
    child inherits the module-level ``page`` variable; under the spawn
    start method (Windows/macOS) it raises NameError.  The parameter is
    now named ``page`` to match its use; the caller passes it
    positionally, so the rename is backward compatible.
    """
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
    
# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")

    while True:
        # Each fresh child process gets its own fresh Twisted reactor.
        worker = multiprocessing.Process(target=run_crawler, args=(keyword, page))
        worker.start()
        worker.join()

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            break

Method 2:

Found in Google: Restarting a Twisted Reactor.

It is old post which uses del to remove module twisted from memory and later it imports it again.

# Ask once; the same keyword / page range is reused on every pass.
keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True

while flag:

   process = CrawlProcess()
   process.crawl(crawler1, keyword, page_range)
   process.crawl(crawler2, keyword, page_range)
   process.crawl(crawler3, keyword, page_range)
   process.start()  # blocking call; runs (and stops) the reactor

   isContinue = input("Do you want to continue? (y/n): ")

   if isContinue == 'n':
      flag = False

   # Make Twisted forget its stopped reactor so the next iteration can
   # install a fresh one.  NOTE(review): this pokes Twisted internals
   # and also runs on the final pass after the user answers 'n' --
   # presumably harmless, but confirm against the Twisted version used.
   import sys
   del sys.modules['twisted.internet.reactor']
   from twisted.internet import reactor
   from twisted.internet import default
   default.install()

Minimal working code (tested on Linux)

import scrapy

class MySpider(scrapy.Spider):
    """Scrape book titles and cover-image URLs from books.toscrape.com."""

    name = 'myspider'

    def __init__(self, keyword, page, *args, **kwargs):
        """Remember the search parameters and build start_urls for *page*."""
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        # One <article class="product_pod"> per book on the listing page.
        for product in response.css('article.product_pod'):
            book_title = product.css('h3 a::text').get()
            image_url = response.urljoin(product.css('img::attr(src)').get())
            yield {
                'page': self.page,
                'keyword': self.keyword,
                'title': book_title,
                'image': image_url,
            }

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    """Crawl three consecutive pages for *keyword*, appending to output.csv."""
    crawler = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        # write scraped items to a CSV feed (FEEDS is new in Scrapy 2.1)
        'FEEDS': {'output.csv': {'format': 'csv'}},
    })
    next_page = int(page)
    crawler.crawl(MySpider, keyword, page)           # the requested page
    crawler.crawl(MySpider, keyword, next_page + 1)  # ... and the next two
    crawler.crawl(MySpider, keyword, next_page + 2)
    crawler.start()
    
# --- main ---

if __name__ == '__main__':
    # Ask once; the same keyword/page pair is reused for every repeat.
    keyword = input("enter keyword: ")
    page    = input("enter page: ")

    running = True
    while running:

        run_crawler(keyword, page)

        answer = input('Repeat [Y/n]? ').strip().lower()

        if answer == 'n':
            running = False

        # Drop the stopped reactor from sys.modules and install a fresh
        # default one so run_crawler() can start a reactor again on the
        # next pass.  NOTE(review): relies on Twisted internals and also
        # runs after the user answers 'n' -- presumably harmless, but
        # confirm with the Twisted version in use.
        import sys
        del sys.modules['twisted.internet.reactor']
        from twisted.internet import reactor
        from twisted.internet import default
        default.install()

Method 3:

It seems you could use CrawlerRunner instead of CrawlerProcess — but I didn't test it yet.

Based on the last example in the docs for Running multiple spiders in the same process, I created code which runs the while-loop inside the reactor (so the reactor doesn't have to stop). It first runs one Spider, then the second, then asks for continuation and runs the Spiders again in the same order. It doesn't run the Spiders at the same time, but maybe that could somehow be changed.

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    #start_urls = ['https://books.toscrape.com/']

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)

        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}

# --- run without project and save in `output.csv` ---

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

@defer.inlineCallbacks
def run_crawler():
    """Repeatedly run the three crawls inside the already-running reactor.

    Relies on the module-level ``runner``, ``keyword`` and ``page`` set in
    the ``__main__`` block.  Each ``yield`` suspends this generator until
    that crawl's Deferred fires, so the spiders run one after another.
    """
    running = True
    while running:

        # Sequential: each crawl starts only after the previous finished.
        yield runner.crawl(MySpider, keyword, page)
        yield runner.crawl(MySpider, keyword, int(page)+1)
        yield runner.crawl(MySpider, keyword, int(page)+2)

        # NOTE(review): input() blocks the reactor thread while waiting;
        # no crawl is active at this point, so nothing else should stall,
        # but confirm if more work is ever scheduled on this reactor.
        answer = input('Repeat [Y/n]? ').strip().lower()

        if answer == 'n':
            running = False
            reactor.stop()
            #return

# --- main ---

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page    = input("enter page: ")

    # Route Scrapy's log output through the standard logging machinery.
    configure_logging()

    # CrawlerRunner (unlike CrawlerProcess) does not own/start the
    # reactor itself, so one reactor can stay running across all crawls.
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0',
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })

    # Schedules the crawling generator; the reactor below drives it.
    run_crawler()

    reactor.run()

EDIT:

The same but now all crawlers run at the same time

@defer.inlineCallbacks
def run_crawler():
    """Run all three crawls concurrently, then repeat on request.

    ``runner.crawl`` only schedules each spider; ``runner.join`` returns
    a Deferred that fires once every scheduled crawl has finished.
    Relies on the module-level ``runner``, ``keyword`` and ``page``.
    """
    running = True
    while running:

        # Schedule all three crawls without waiting in between ...
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)

        # ... then wait once for all of them to complete.
        d = runner.join()
        yield d

        answer = input('Repeat [Y/n]? ').strip().lower()

        if answer == 'n':
            running = False
            reactor.stop()
            #return
like image 56
furas Avatar answered Sep 08 '25 21:09

furas


You can run spiders in a loop by installing the reactor at the top level before other scrapy or reactor imports, then deleting the reactor after each crawl. This worked for me:

main.py

import time
from spider_utils import run_crawler

# Run both spiders forever, pausing a minute between rounds.
while True:
    run_crawler('spider1')
    run_crawler('spider2')
    time.sleep(60)

spider_utils.py

# Install the asyncio reactor BEFORE anything else imports
# twisted.internet.reactor -- install_reactor() must run first, which is
# why this sits above the other scrapy imports.
from scrapy.utils.reactor import install_reactor
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_crawler(spider_name: str):
    """Run isolated spider and restart reactor to run another spider afterwards."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(spider_name)
    crawler_process.start()

    # Forget the used reactor so the next call installs a fresh one.
    import sys
    del sys.modules['twisted.internet.reactor']
like image 34
JTraa Avatar answered Sep 08 '25 22:09

JTraa