scripts/init-feed: support --podcast
argument to select podcasts over text
This commit is contained in:
@@ -4,14 +4,12 @@
|
||||
from feedsearch_crawler import search, sort_urls
|
||||
from feedsearch_crawler.crawler import coerce_url
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
url, jsonPath = sys.argv[1:]
|
||||
|
||||
logging.getLogger().setLevel(logging.DEBUG)
|
||||
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
|
||||
logging.getLogger(__name__).debug("logging enabled")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def try_scheme(url: str, scheme: str):
|
||||
url = coerce_url(url, default_scheme=scheme)
|
||||
@@ -41,16 +39,55 @@ def clean_item(item: dict) -> dict:
|
||||
]
|
||||
}
|
||||
|
||||
items = try_scheme(url, "https") or try_scheme(url, "http")
|
||||
def try_load_existing_feed(path_: str) -> dict:
    """Load previously saved feed data from a JSON file, if it can be opened.

    Args:
        path_: filesystem path of the JSON file to read.

    Returns:
        The decoded JSON document, or an empty dict when the file cannot be
        opened (for example because it does not exist yet).

    Raises:
        json.JSONDecodeError: if the file opens but does not contain valid JSON.
    """
    try:
        f = open(path_, "r")
    except OSError:
        # Only a failure to open the file means "no existing feed data".
        # A bare `except:` here would also hide KeyboardInterrupt and real bugs.
        return {}
    # Close the handle deterministically instead of leaking it.
    with f:
        return json.loads(f.read())
|
||||
|
||||
# print all results
|
||||
serialized = [item.serialize() for item in items]
|
||||
serialized = [clean_item(s) for s in serialized]
|
||||
for item in serialized:
|
||||
print(json.dumps(item, sort_keys=True, indent=2))
|
||||
def select_feed(feeds: list[dict], prefer_podcast: bool) -> dict:
    """Pick the single best feed out of a list of candidate feeds.

    Candidates are ranked by, in order of priority:
      1. whether they match the requested media format (podcast vs. not),
      2. higher "velocity",
      3. shorter URL (ties broken alphabetically),
      4. shorter title (a missing or empty title counts as length 1000).

    If several candidates rank equally, the earliest one in `feeds` wins.

    Args:
        feeds: candidate feeds; each must have a "url" key and may have
            "is_podcast", "velocity" and "title" keys.
        prefer_podcast: True to favor podcast feeds, False to favor the rest.

    Returns:
        The best-ranked feed, or an empty dict when `feeds` is empty.
    """
    def rank(f: dict) -> tuple:
        title = f.get("title") or ""
        return (
            # False sorts before True, so feeds in the requested media format
            # come first; a feed that doesn't say is assumed to be the other format.
            f.get("is_podcast", not prefer_podcast) != prefer_podcast,
            # prefer higher-velocity sources (a missing/None velocity counts as 0)
            -(f.get("velocity") or 0),
            # prefer shorter URLs; the URL itself keeps the order deterministic
            len(f["url"]),
            f["url"],
            # prefer shorter titles, but rank untitled feeds last
            len(title) if title else 1000,
        )

    # The feed dict itself is deliberately not part of the key: dicts are not
    # orderable, so fully tied candidates would raise TypeError.
    return min(feeds, key=rank, default={})
|
||||
|
||||
# save the first result to disk
|
||||
keep = serialized[0] if serialized else {}
|
||||
results = json.dumps(keep, sort_keys=True, indent=2)
|
||||
with open(jsonPath, "w") as out:
|
||||
out.write(results)
|
||||
def main():
    """Command-line entry point: search a URL for feeds and save the best one.

    Takes two positional arguments (the URL to search and the JSON output
    path) and an optional `--podcast` flag. Every discovered feed is printed
    to stdout as JSON; the one chosen by `select_feed` is written to the
    output path. Podcast feeds are preferred when `--podcast` is given or when
    the data already stored at the output path has a truthy "is_podcast".
    """
    logging.basicConfig()
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    logger.debug("logging enabled")

    cli = argparse.ArgumentParser(usage=__doc__)
    cli.add_argument("url", help="where to start searching for a feed")
    cli.add_argument("output", help="where to save extracted feed data (should end in .json)")
    cli.add_argument(
        '--podcast',
        action='store_true',
        help="if multiple feeds are found, prefer the podcast feed over any text/image feed",
    )
    opts = cli.parse_args()

    # a feed that was saved as a podcast stays a podcast on re-runs
    previous = try_load_existing_feed(opts.output)
    want_podcast = opts.podcast or previous.get("is_podcast", False)

    # try https first; fall back to http only if that yields nothing
    found = try_scheme(opts.url, "https")
    if not found:
        found = try_scheme(opts.url, "http")

    # print all results
    raw = [entry.serialize() for entry in found]
    candidates = [clean_item(entry) for entry in raw]
    for candidate in candidates:
        print(json.dumps(candidate, sort_keys=True, indent=2))

    # save the best feed to disk
    best = select_feed(candidates, prefer_podcast=want_podcast)
    payload = json.dumps(best, sort_keys=True, indent=2)
    with open(opts.output, "w") as out:
        out.write(payload)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -1,10 +1,26 @@
|
||||
#!/usr/bin/env nix-shell
|
||||
#!nix-shell -i bash -p feeds.update-feed -p gnused
|
||||
|
||||
# Split the command line: the first non-flag argument is the feed source;
# every `--flag` and any further positional argument is collected in
# passthruArgs, in its original order.
source=
passthruArgs=()
for arg in "$@"; do
  if [[ $arg == --* ]]; then
    passthruArgs+=("$arg")
  elif [ -z "$source" ]; then
    source="$arg"
  else
    passthruArgs+=("$arg")
  fi
done
|
||||
|
||||
sources_dir=modules/data/feeds/sources
|
||||
# prettify the URL, by default
|
||||
name=$( \
|
||||
echo "$1" \
|
||||
echo "$source" \
|
||||
| sed 's|^https://||' \
|
||||
| sed 's|^http://||' \
|
||||
| sed 's|^www\.||' \
|
||||
@@ -17,5 +33,5 @@ json_path="$sources_dir/$name/default.json"
|
||||
pushd "$sources_dir"; mkdir -p "$name"; popd
|
||||
|
||||
# update.py: from `feeds.update-feed` (TODO: rename the binary!)
|
||||
update.py "$name" "$json_path"
|
||||
update.py "$name" "$json_path" "${passthruArgs[@]}"
|
||||
cat "$json_path"
|
||||
|
Reference in New Issue
Block a user