feedsearch-crawler: fix for very large RSS feeds (like congressionaldish)
This commit is contained in:
@@ -5,11 +5,16 @@ from feedsearch_crawler import search, sort_urls
|
||||
from feedsearch_crawler.crawler import coerce_url
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
url, jsonPath = sys.argv[1:]
|
||||
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
|
||||
logging.getLogger(__name__).debug("logging enabled")
|
||||
|
||||
url = coerce_url(url, default_scheme="https")
|
||||
items = search(url)
|
||||
items = search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
|
||||
items = sort_urls(items)
|
||||
|
||||
# print all results
|
||||
|
13
pkgs/feedsearch-crawler/0001-response-chunk-size.patch
Normal file
13
pkgs/feedsearch-crawler/0001-response-chunk-size.patch
Normal file
@@ -0,0 +1,13 @@
|
||||
diff --git a/feedsearch_crawler/crawler/request.py b/feedsearch_crawler/crawler/request.py
|
||||
index 70f3d5a..00668ad 100644
|
||||
--- a/feedsearch_crawler/crawler/request.py
|
||||
+++ b/feedsearch_crawler/crawler/request.py
|
||||
@@ -277,7 +277,7 @@ class Request(Queueable):
|
||||
"""
|
||||
body: bytes = b""
|
||||
try:
|
||||
- async for chunk in resp.content.iter_chunked(1024):
|
||||
+ async for chunk in resp.content.iter_chunked(64 * 1024):
|
||||
if not chunk:
|
||||
break
|
||||
body += chunk
|
@@ -28,6 +28,12 @@ buildPythonPackage rec {
|
||||
hash = "sha256-pzvyeXzqdi8pRjk2+QjKhJfgtxbgVT6C08K9fhVFVmY=";
|
||||
};
|
||||
|
||||
patches = [
|
||||
# fix for <https://github.com/aio-libs/aiohttp/issues/4581>
|
||||
# where large feeds would time out in an unrecoverable way
|
||||
./0001-response-chunk-size.patch
|
||||
];
|
||||
|
||||
nativeBuildInputs = [
|
||||
poetry-core
|
||||
];
|
||||
|
Reference in New Issue
Block a user