#!/usr/bin/env nix-shell
#!nix-shell -i python3 -p "python3.withPackages (ps: [ ps.feedsearch-crawler ])"
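# Find RSS/Atom feeds for a site with feedsearch-crawler: every candidate
# found is printed, and the highest-ranked one is saved to a JSON file.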
from feedsearch_crawler import search, sort_urls
from feedsearch_crawler.crawler import coerce_url
import json
import logging
import sys

# Positional arguments: the site URL to scan for feeds and the path of the
# JSON file that the best match will be written to.
url, jsonPath = sys.argv[1:]

# Log verbosely to stdout so the crawl can be followed while it runs.
logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
logging.getLogger(__name__).debug("logging enabled")


def try_scheme(url: str, scheme: str):
    # Apply the scheme if the URL does not already carry one, then crawl the
    # site for feeds with generous timeouts and a 100 MiB content cap.
    url = coerce_url(url, default_scheme=scheme)
    print(f"trying {url}")
    items = search(url, total_timeout=180, request_timeout=90, max_content_length=100*1024*1024)
    # sort_urls ranks the discovered feeds, best candidates first.
    return sort_urls(items)


# Try https first; fall back to plain http only if the https crawl found nothing.
items = try_scheme(url, "https") or try_scheme(url, "http")

# print all results
serialized = [item.serialize() for item in items]
for item in serialized:
    print(json.dumps(item, sort_keys=True, indent=2))

# save the first result to disk
keep = serialized[0] if serialized else {}
results = json.dumps(keep, sort_keys=True, indent=2)
with open(jsonPath, "w") as out:
    out.write(results)
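# Example invocation (the script name here is illustrative):
#   ./find-feed.py example.org feed.json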