I have written a spider in Scrapy to crawl hundreds of thousands of pages from some news websites. It works well when I start it from the command line, and memory usage stays at a stable 20% on my 4 GB PC. (I use priorities on requests to make sure there aren't too many requests alive at once.) But when I start it from a Python script, memory usage keeps growing until the spider eats up all the memory. This is my start-up script:
import os
import sys
from datetime import datetime, timedelta
from threading import RLock

from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.log import ScrapyFileLogObserver
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

# MySpider1 and MySpider2 are spiders defined elsewhere in the project


class CrawlersInitiator(object):

    def __init__(self, spiders, start=datetime.now() - timedelta(minutes=30), end=datetime.now()):
        self.setting = get_project_settings()
        self.crawlers = []
        self.spiders = spiders
        self.start_time = start
        self.end_time = end
        # log file
        self.info_log = None
        log_dir = self.setting.get("LOG_DIR")
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        # counter used to stop the reactor
        self.stopped_crawler = 0
        self.lock = RLock()

    def __del__(self):
        self.close_log_file()

    def create_log_file(self):
        """Create a log file with the crawl date in its name."""
        self.close_log_file()
        dir_path = self.setting.get("LOG_DIR") + "/{0}".format(self.end_time.strftime("%Y-%m"))
        file_suffix = self.end_time.strftime("%Y-%m-%d")
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        self.info_log = open("{0}/log-{1}.log".format(dir_path, file_suffix), "a")  # info

    def close_log_file(self):
        if self.info_log and not self.info_log.closed:
            self.info_log.close()
            self.info_log = None

    def get_crawler(self, spider):
        crawler = Crawler(self.setting)
        crawler.signals.connect(self.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider(start_time=self.start_time, end_time=self.end_time))
        return crawler

    def stop(self):
        """Callback that stops the reactor once every crawler has closed."""
        self.lock.acquire()
        self.stopped_crawler += 1
        if self.stopped_crawler >= len(self.crawlers):
            reactor.stop()
        self.lock.release()

    def run_spiders(self):
        """Run the spiders."""
        self.crawlers = []
        self.stopped_crawler = 0
        # get crawlers
        for Spider in self.spiders:
            self.crawlers.append(self.get_crawler(Spider))
        # log
        self.create_log_file()
        ScrapyFileLogObserver(self.info_log, level=log.INFO).start()
        self.info_log.write("\nCrawlers starting...\n")
        self.info_log.write("Crawl from {0} to {1}".format(str(self.start_time), str(self.end_time)))
        # run
        for crawler in self.crawlers:
            crawler.start()
        reactor.run()
        end = datetime.now()
        # release crawlers
        for crawler in self.crawlers:
            del crawler
        # log
        self.info_log.write("Crawlers finished in {0}!\n".format(str(end - self.end_time)))
        self.close_log_file()


def crawl(spiders, start, end):
    CrawlersInitiator(spiders, start=start, end=end).run_spiders()


SPIDERS = [MySpider1, MySpider2]

if __name__ == "__main__":
    start_time = datetime.strptime(sys.argv[1], "%Y-%m-%d_%H:%M:%S")
    end_time = datetime.strptime(sys.argv[2], "%Y-%m-%d_%H:%M:%S")
    crawl(SPIDERS, start_time, end_time)
    quit()
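For reference, the script takes the crawl window as two %Y-%m-%d_%H:%M:%S timestamps on the command line, e.g. something like python run_crawlers.py 2015-03-01_00:00:00 2015-03-01_12:00:00 (the file name run_crawlers.py is just a placeholder for whatever the script above is saved as).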
I have tried to use Scrapy's trackref to find the problem.
When started from the command line, prefs() shows (only one spider was started):
MySpider1 1 oldest: 942s ago
HtmlResponse 13 oldest: 52s ago
Request 6329 oldest: 932s ago
Item 5915 oldest: 932s ago
Selector 13 oldest: 52s ago
When started from the script, prefs() shows:
Response 51 oldest: 657s ago
Request 6966 oldest: 661s ago
Item 5732 oldest: 661s ago
HtmlResponse 377 oldest: 661s ago
Selector 377 oldest: 661s ago
MySpider1 1 oldest: 661s ago
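(prefs() above is the shortcut the Scrapy telnet console exposes for scrapy.utils.trackref.print_live_refs(). A minimal sketch of reading the same counters directly in Python, using nothing but the stock trackref module:)
from scrapy.utils.trackref import live_refs, print_live_refs, get_oldest

print_live_refs()  # prints the "ClassName  N  oldest: Xs ago" table shown above

for cls, refs in live_refs.items():  # live_refs maps each tracked class to a weak dict of live instances
    print("{0}: {1} live instances".format(cls.__name__, len(refs)))

oldest_request = get_oldest("Request")  # the oldest tracked Request still alive, or None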
It looks like Scrapy never releases any objects when started from my script. Why does this happen, and how can I solve it?
Here is the superclass of all my spiders; all of the requests are processed in this class:
from abc import ABCMeta, abstractmethod
from datetime import datetime
import urllib

from scrapy import log
from scrapy.http import Request
from scrapy.spider import Spider


class IndexSpider(Spider):
    __metaclass__ = ABCMeta

    # splice _queries onto _search_url to build start_requests (index pages of news)
    _search_url = ""
    _queries = []
    _char_set = "utf8"

    def __init__(self, queries=None, start_time=datetime.min, end_time=datetime.now()):
        super(IndexSpider, self).__init__()
        self.queries = queries if queries is not None else self._queries
        self.start_time = start_time
        self.end_time = end_time

    def start_requests(self):
        query_count = 0
        query = None
        try:
            for query in self.queries:
                yield Request(self._search_url.format(urllib.quote(query.encode(self._char_set))),
                              self.parse_index)
                query_count += 1
        except Exception as e:
            self.log("Query No.{0} can't be encoded in {1}, because of {2}!"
                     .format(str(query_count), self.name, e), level=log.WARNING)
            yield Request(self._search_url.format(query.encode("gbk")), self.parse_index)

    def parse_index(self, response):
        """Parse an index page."""
        requests = []
        page_list = self._get_result(response)
        if not page_list:
            return requests
        next_page = True
        for item in page_list:
            if isinstance(item, Request):
                requests.append(item)
                next_page = False
                break
            if item['publish_time'] <= self.start_time:
                next_page = False
                break
            elif item['publish_time'] > self.end_time:
                continue
            else:
                req = Request(item['url'], self.parse_news, priority=1)
                req.meta["item"] = item
                requests.append(req)
        if next_page:
            next_page = self._next_index_page(response)
            if next_page:
                requests.append(Request(self._next_index_page(response), self.parse_index))
        return requests

    def parse_news(self, response):
        """Parse a news page."""
        item = response.meta["item"]
        del response.meta['item']
        return self._finish_item(item, response)

    @abstractmethod
    def _get_result(self, response):
        """Get the news list from an index page.

        :param response: index page
        :return: a list of crawlers.items.Base (or subclass) objects, each representing one news article
        """
        pass

    @abstractmethod
    def _next_index_page(self, response):
        """
        :param response: current index page
        :return: URL of the next index page
        """
        pass

    @abstractmethod
    def _finish_item(self, item, response):
        """Parse a news page.

        :param item: news item obtained from the index page
        :param response: news page
        :return: the finished news item or a new request
        """
        pass
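For context, a concrete spider only has to fill in the three abstract hooks. A minimal hypothetical subclass (the site URL, XPath selectors and the NewsItem fields below are made up purely for illustration; they are not part of my real spiders):
class ExampleNewsSpider(IndexSpider):
    name = "example_news"
    _search_url = "http://news.example.com/search?q={0}&page=1"
    _queries = [u"economy", u"technology"]

    def _get_result(self, response):
        items = []
        for sel in response.xpath("//div[@class='result']"):
            item = NewsItem()  # hypothetical crawlers.items.Base subclass
            item['url'] = sel.xpath("./a/@href").extract()[0]
            item['publish_time'] = datetime.strptime(
                sel.xpath("./span[@class='date']/text()").extract()[0], "%Y-%m-%d %H:%M")
            items.append(item)
        return items

    def _next_index_page(self, response):
        next_href = response.xpath("//a[@class='next']/@href").extract()
        return next_href[0] if next_href else None

    def _finish_item(self, item, response):
        item['content'] = "".join(response.xpath("//div[@id='article']//text()").extract())
        return item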
Maybe Scrapy isn't using the full power of your computer when it runs from a script. You can still use the settings to control the spider's behaviour:
from scrapy.crawler import CrawlerProcess

my_settings = {
    'MEMUSAGE_ENABLED': 1,
    'MEMUSAGE_LIMIT_MB': 1024,
    'CONCURRENT_REQUESTS': 100,  # lower this if it is still hitting the memory limit
}

process = CrawlerProcess(my_settings)
process.crawl(MySpider)
process.start()
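Note that MEMUSAGE_ENABLED / MEMUSAGE_LIMIT_MB only tell the memusage extension to shut the crawl down once the process grows past the limit (and the extension only works where Python's resource module is available, i.e. not on Windows); it does not free anything by itself. Lowering CONCURRENT_REQUESTS is what actually reduces how many requests and responses are alive at the same time.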