scripts/init-feed: support --podcast
argument to select podcasts over text
This commit is contained in:
@@ -4,14 +4,12 @@
|
|||||||
from feedsearch_crawler import search, sort_urls
|
from feedsearch_crawler import search, sort_urls
|
||||||
from feedsearch_crawler.crawler import coerce_url
|
from feedsearch_crawler.crawler import coerce_url
|
||||||
|
|
||||||
|
import argparse
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
url, jsonPath = sys.argv[1:]
|
|
||||||
|
|
||||||
logging.getLogger().setLevel(logging.DEBUG)
|
logger = logging.getLogger(__name__)
|
||||||
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
|
|
||||||
logging.getLogger(__name__).debug("logging enabled")
|
|
||||||
|
|
||||||
def try_scheme(url: str, scheme: str):
|
def try_scheme(url: str, scheme: str):
|
||||||
url = coerce_url(url, default_scheme=scheme)
|
url = coerce_url(url, default_scheme=scheme)
|
||||||
@@ -41,6 +39,42 @@ def clean_item(item: dict) -> dict:
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def try_load_existing_feed(path_: str) -> dict:
    """Best-effort load of previously saved feed data.

    Args:
        path_: path of the JSON file to read.

    Returns:
        The decoded JSON content of ``path_``, or an empty dict when the file
        cannot be opened/read or does not hold valid JSON.
    """
    try:
        # `with` closes the handle on every path (the handle used to leak).
        with open(path_, "r") as f:
            return json.loads(f.read())
    except (OSError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: malformed JSON (json.JSONDecodeError) or undecodable
        # bytes (UnicodeDecodeError) -- treated the same as "no prior data".
        return {}
|
||||||
|
|
||||||
|
def select_feed(feeds: list[dict], prefer_podcast: bool) -> dict:
    """Pick the single best feed out of ``feeds``.

    Candidates are ranked by, in order of priority:
      1. matching the requested media format (``is_podcast`` equal to
         ``prefer_podcast``; a feed without the key counts as a mismatch),
      2. higher ``velocity`` (missing/None counts as 0),
      3. shorter ``url``,
      4. shorter non-empty ``title`` (missing/empty titles rank last),
      5. the ``url`` string itself, as a deterministic tie-breaker.
    Feeds that tie on everything keep their input order.

    Args:
        feeds: candidate feed records; each must have a "url" key.
        prefer_podcast: whether podcast feeds should win over other feeds.

    Returns:
        The best-ranked feed, or an empty dict when ``feeds`` is empty.
    """
    def rank(feed: dict) -> tuple:
        title = feed.get("title") or ""
        return (
            # False sorts before True, so format matches come first
            feed.get("is_podcast", not prefer_podcast) != prefer_podcast,
            -(feed.get("velocity") or 0),
            len(feed["url"]),
            len(title) if title else 1000,
            # compare the URL text rather than the dict itself: dicts are
            # not orderable and would raise TypeError on a full tie
            feed["url"],
        )

    # min() returns the first minimal element, same as sorted(...)[0]
    return min(feeds, key=rank) if feeds else {}
|
||||||
|
|
||||||
|
def main():
|
||||||
|
logging.basicConfig()
|
||||||
|
logging.getLogger().setLevel(logging.DEBUG)
|
||||||
|
logger.debug("logging enabled")
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(usage=__doc__)
|
||||||
|
parser.add_argument("url", help="where to start searching for a feed")
|
||||||
|
parser.add_argument("output", help="where to save extracted feed data (should end in .json)")
|
||||||
|
parser.add_argument('--podcast', help="if multiple feeds are found, prefer the podcast feed over any text/image feed", action='store_true')
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
url, json_path = args.url, args.output
|
||||||
|
|
||||||
|
existing_data = try_load_existing_feed(json_path)
|
||||||
|
|
||||||
|
prefer_podcast = args.podcast or existing_data.get("is_podcast", False)
|
||||||
|
|
||||||
items = try_scheme(url, "https") or try_scheme(url, "http")
|
items = try_scheme(url, "https") or try_scheme(url, "http")
|
||||||
|
|
||||||
# print all results
|
# print all results
|
||||||
@@ -49,8 +83,11 @@ serialized = [clean_item(s) for s in serialized]
|
|||||||
for item in serialized:
|
for item in serialized:
|
||||||
print(json.dumps(item, sort_keys=True, indent=2))
|
print(json.dumps(item, sort_keys=True, indent=2))
|
||||||
|
|
||||||
# save the first result to disk
|
# save the best feed to disk
|
||||||
keep = serialized[0] if serialized else {}
|
keep = select_feed(serialized, prefer_podcast=prefer_podcast)
|
||||||
results = json.dumps(keep, sort_keys=True, indent=2)
|
results = json.dumps(keep, sort_keys=True, indent=2)
|
||||||
with open(jsonPath, "w") as out:
|
with open(json_path, "w") as out:
|
||||||
out.write(results)
|
out.write(results)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
@@ -1,10 +1,26 @@
|
|||||||
#!/usr/bin/env nix-shell
|
#!/usr/bin/env nix-shell
|
||||||
#!nix-shell -i bash -p feeds.update-feed -p gnused
|
#!nix-shell -i bash -p feeds.update-feed -p gnused
|
||||||
|
|
||||||
|
# Split the command line: the first non-flag argument becomes the feed
# source; every `--flag` and every later positional argument is collected
# in passthruArgs so it can be forwarded unchanged.
source=
passthruArgs=()
for arg in "$@"; do
  if [[ "$arg" == --* ]]; then
    passthruArgs+=("$arg")
  elif [ -z "$source" ]; then
    source="$arg"
  else
    passthruArgs+=("$arg")
  fi
done
|
||||||
|
|
||||||
sources_dir=modules/data/feeds/sources
|
sources_dir=modules/data/feeds/sources
|
||||||
# prettify the URL, by default
|
# prettify the URL, by default
|
||||||
name=$( \
|
name=$( \
|
||||||
echo "$1" \
|
echo "$source" \
|
||||||
| sed 's|^https://||' \
|
| sed 's|^https://||' \
|
||||||
| sed 's|^http://||' \
|
| sed 's|^http://||' \
|
||||||
| sed 's|^www\.||' \
|
| sed 's|^www\.||' \
|
||||||
@@ -17,5 +33,5 @@ json_path="$sources_dir/$name/default.json"
|
|||||||
pushd "$sources_dir"; mkdir -p "$name"; popd
|
pushd "$sources_dir"; mkdir -p "$name"; popd
|
||||||
|
|
||||||
# update.py: from `feeds.update-feed` (TODO: rename the binary!)
|
# update.py: from `feeds.update-feed` (TODO: rename the binary!)
|
||||||
update.py "$name" "$json_path"
|
update.py "$name" "$json_path" "${passthruArgs[@]}"
|
||||||
cat "$json_path"
|
cat "$json_path"
|
||||||
|
Reference in New Issue
Block a user