scripts/init-feed: support --podcast argument to select podcasts over text

This commit is contained in:
2024-08-16 02:34:19 +00:00
parent 729d2a9809
commit f603bad779
2 changed files with 70 additions and 17 deletions

View File

@@ -4,14 +4,12 @@
from feedsearch_crawler import search, sort_urls
from feedsearch_crawler.crawler import coerce_url
import argparse
import json
import logging
import sys
url, jsonPath = sys.argv[1:]  # NOTE(review): old positional-argv CLI — removed side of this diff, superseded by argparse in main()
logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
logging.getLogger(__name__).debug("logging enabled")
logger = logging.getLogger(__name__)  # module-level logger used by main()
def try_scheme(url: str, scheme: str):
url = coerce_url(url, default_scheme=scheme)
@@ -41,16 +39,55 @@ def clean_item(item: dict) -> dict:
]
}
items = try_scheme(url, "https") or try_scheme(url, "http")  # NOTE(review): old top-level crawl (https first, http fallback) — the argparse version lives in main()
def try_load_existing_feed(path_: str) -> dict:
    """Return the JSON document previously saved at *path_*, or {} if absent.

    Only a failure to open the file (first run, bad path, permissions) yields
    the empty fallback.  JSON decode errors still propagate on purpose: a
    corrupt saved feed should surface rather than be silently replaced.
    """
    try:
        f = open(path_, "r")
    except OSError:
        # No readable existing feed -> caller starts from scratch.
        return {}
    with f:  # close the handle even if parsing raises
        return json.loads(f.read())
# print all results
# NOTE(review): removed side of the diff — this top-level serialize/print loop now lives in main()
serialized = [item.serialize() for item in items]
serialized = [clean_item(s) for s in serialized]
for item in serialized:
print(json.dumps(item, sort_keys=True, indent=2))
def select_feed(feeds: list[dict], prefer_podcast: bool) -> dict:
    """Pick the single best feed from *feeds*, or {} when the list is empty.

    Ranking, earlier criteria winning:
      1. feeds whose media type matches the request (podcast vs. text);
         a feed missing ``is_podcast`` counts as NOT matching
      2. higher ``velocity`` (posting rate; missing -> 0)
      3. shorter URL
      4. shorter non-empty title (missing/empty titles rank last, at 1000)
      5. the URL string itself, as a deterministic final tiebreaker
    """
    def rank(feed: dict) -> tuple:
        matches = feed.get("is_podcast", not prefer_podcast) == prefer_podcast
        title = feed.get("title", "")
        return (
            not matches,                    #< prefer the requested media format
            -feed.get("velocity", 0),       #< prefer higher-velocity sources
            len(feed["url"]),               #< prefer shorter URLs
            len(title) if title else 1000,  #< prefer shorter titles
            feed["url"],                    #< stable tiebreak (dicts themselves aren't orderable)
        )
    # min() with the rank key == sorted(...)[0], without building a sorted copy
    return min(feeds, key=rank) if feeds else {}
# save the first result to disk
# NOTE(review): removed side of the diff — superseded by select_feed() plus the save step in main()
keep = serialized[0] if serialized else {}
results = json.dumps(keep, sort_keys=True, indent=2)
with open(jsonPath, "w") as out:
out.write(results)
def main():
    """Crawl a site for feeds, print every candidate, and save the best one."""
    # verbose logging for the whole run
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    logger.debug("logging enabled")

    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument("url", help="where to start searching for a feed")
    parser.add_argument("output", help="where to save extracted feed data (should end in .json)")
    parser.add_argument('--podcast', help="if multiple feeds are found, prefer the podcast feed over any text/image feed", action='store_true')
    opts = parser.parse_args()

    # a feed previously saved as a podcast stays a podcast, even without --podcast
    previous = try_load_existing_feed(opts.output)
    want_podcast = opts.podcast or previous.get("is_podcast", False)

    # crawl over https first, falling back to plain http
    found = try_scheme(opts.url, "https") or try_scheme(opts.url, "http")
    candidates = [clean_item(entry.serialize()) for entry in found]

    # print all results
    for candidate in candidates:
        print(json.dumps(candidate, sort_keys=True, indent=2))

    # save the best feed to disk
    best = select_feed(candidates, prefer_podcast=want_podcast)
    with open(opts.output, "w") as out:
        out.write(json.dumps(best, sort_keys=True, indent=2))


if __name__ == '__main__':
    main()

View File

@@ -1,10 +1,26 @@
#!/usr/bin/env nix-shell
#!nix-shell -i bash -p feeds.update-feed -p gnused
# Split the CLI args: the first bare (non --flag) argument becomes the feed
# source URL; every --flag (e.g. --podcast) and any later bare argument is
# collected in passthruArgs and forwarded to update.py unchanged.
source=
passthruArgs=()
for arg in "$@"; do
case $arg in
(--*)
passthruArgs+=("$arg")
;;
(*)
if [ -z "$source" ]; then
source="$arg"
else
passthruArgs+=("$arg")
fi
esac
done
sources_dir=modules/data/feeds/sources
# prettify the URL, by default
# NOTE(review): the diff rendering shows BOTH the old (`echo "$1"`) and new
# (`echo "$source"`) lines of this pipeline below; only the $source line is
# the post-change code.
name=$( \
echo "$1" \
echo "$source" \
| sed 's|^https://||' \
| sed 's|^http://||' \
| sed 's|^www\.||' \
@@ -17,5 +33,5 @@ json_path="$sources_dir/$name/default.json"
pushd "$sources_dir"; mkdir -p "$name"; popd
# update.py: from `feeds.update-feed` (TODO: rename the binary!)
# NOTE(review): diff pair again — the second update.py line (with
# "${passthruArgs[@]}") is the post-change invocation.
update.py "$name" "$json_path"
update.py "$name" "$json_path" "${passthruArgs[@]}"
cat "$json_path"