Merge pull request #244656 from jokatzke/trafilatura
python3Packages.trafilatura: init at 1.6.3
This commit is contained in:
commit
bc48aabfa7
|
@ -9162,6 +9162,12 @@
|
|||
fingerprint = "7249 70E6 A661 D84E 8B47 678A 0590 93B1 A278 BCD0";
|
||||
}];
|
||||
};
|
||||
jokatzke = {
|
||||
email = "jokatzke@fastmail.com";
|
||||
github = "jokatzke";
|
||||
githubId = 46931073;
|
||||
name = "Jonas Katzke";
|
||||
};
|
||||
joko = {
|
||||
email = "ioannis.koutras@gmail.com";
|
||||
github = "jokogr";
|
||||
|
|
54
pkgs/development/python-modules/courlan/default.nix
Normal file
54
pkgs/development/python-modules/courlan/default.nix
Normal file
|
@ -0,0 +1,54 @@
|
|||
# Python package expression for courlan.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  langcodes,
  tld,
  urllib3,
  pytestCheckHook,
}:

buildPythonPackage rec {
  pname = "courlan";
  version = "0.9.5";
  format = "setuptools";

  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-ONw1suO/H11RbQDVGsEuveVD40F8a+b2oic8D8W1s1M=";
  };

  # The unit tests call the `courlan` executable by bare name; rewrite both
  # occurrences so they use the binary under $out/bin instead.
  postPatch = ''
    substituteInPlace tests/unit_tests.py \
      --replace "\"courlan --help\"" "\"$out/bin/courlan --help\"" \
      --replace "courlan_bin = \"courlan\"" "courlan_bin = \"$out/bin/courlan\""
  '';

  propagatedBuildInputs = [
    langcodes
    tld
    urllib3
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  # Skipped because it requires an internet connection.
  disabledTests = [ "test_urlcheck" ];

  pythonImportsCheck = [ "courlan" ];

  meta = {
    description = "Clean, filter and sample URLs to optimize data collection";
    homepage = "https://github.com/adbar/courlan";
    changelog = "https://github.com/adbar/courlan/blob/v${version}/HISTORY.md";
    license = lib.licenses.gpl3Plus;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
|
56
pkgs/development/python-modules/htmldate/default.nix
Normal file
56
pkgs/development/python-modules/htmldate/default.nix
Normal file
|
@ -0,0 +1,56 @@
|
|||
# Python package expression for htmldate.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  backports-datetime-fromisoformat,
  charset-normalizer,
  dateparser,
  lxml,
  python-dateutil,
  urllib3,
  pytestCheckHook,
}:

buildPythonPackage rec {
  pname = "htmldate";
  version = "1.6.0";
  format = "setuptools";

  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-WCfI9iahaACinlfoGIo9MtCwjKTHvWYlN7c7u/IsRaY=";
  };

  propagatedBuildInputs =
    [
      charset-normalizer
      dateparser
      lxml
      python-dateutil
      urllib3
    ]
    # The fromisoformat backport is only added for interpreters older than 3.7.
    ++ lib.optionals (pythonOlder "3.7") [ backports-datetime-fromisoformat ];

  nativeCheckInputs = [ pytestCheckHook ];

  # These tests require an internet connection.
  disabledTests = [
    "test_input"
    "test_cli"
    "test_download"
  ];

  pythonImportsCheck = [ "htmldate" ];

  meta = {
    description = "Fast and robust extraction of original and updated publication dates from URLs and web pages";
    homepage = "https://htmldate.readthedocs.io";
    changelog = "https://github.com/adbar/htmldate/blob/v${version}/CHANGELOG.md";
    license = lib.licenses.gpl3Plus;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
|
43
pkgs/development/python-modules/justext/default.nix
Normal file
43
pkgs/development/python-modules/justext/default.nix
Normal file
|
@ -0,0 +1,43 @@
|
|||
# Python package expression for jusText, built from the upstream GitHub tag.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  lxml,
  pytestCheckHook,
}:

buildPythonPackage rec {
  pname = "justext";
  version = "3.0.0";
  format = "setuptools";

  src = fetchFromGitHub {
    owner = "miso-belica";
    repo = "jusText";
    rev = "refs/tags/v${version}";
    hash = "sha256-WNxDoM5666tEHS9pMl5dOoig4S7dSYaCLZq71tehWqw=";
  };

  # Drop the coverage-report flags from setup.cfg.
  postPatch = ''
    substituteInPlace setup.cfg \
      --replace " --cov=justext --cov-report=term-missing --no-cov-on-fail" ""
  '';

  propagatedBuildInputs = [ lxml ];

  nativeCheckInputs = [ pytestCheckHook ];

  pythonImportsCheck = [ "justext" ];

  meta = {
    description = "Heuristic based boilerplate removal tool";
    homepage = "https://github.com/miso-belica/jusText";
    changelog = "https://github.com/miso-belica/jusText/blob/v${version}/CHANGELOG.rst";
    license = lib.licenses.bsd2;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
|
43
pkgs/development/python-modules/py3langid/default.nix
Normal file
43
pkgs/development/python-modules/py3langid/default.nix
Normal file
|
@ -0,0 +1,43 @@
|
|||
# Python package expression for py3langid.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  numpy,
  pytestCheckHook,
}:

buildPythonPackage rec {
  pname = "py3langid";
  version = "0.2.2";
  format = "setuptools";

  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-tN4B2tfnAfKdIWoJNeheCWzIZ1kD0j6oRFsrtfCQuW8=";
  };

  # The test suite refers to the `langid` executable by bare name; rewrite it
  # to the binary under $out/bin.
  postPatch = ''
    substituteInPlace tests/test_langid.py --replace "'langid'" "'$out/bin/langid'"
  '';

  propagatedBuildInputs = [ numpy ];

  nativeCheckInputs = [ pytestCheckHook ];

  pythonImportsCheck = [ "py3langid" ];

  meta = {
    description = "Fork of the language identification tool langid.py, featuring a modernized codebase and faster execution times";
    homepage = "https://github.com/adbar/py3langid";
    changelog = "https://github.com/adbar/py3langid/blob/v${version}/HISTORY.rst";
    license = lib.licenses.bsd3;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
|
67
pkgs/development/python-modules/trafilatura/default.nix
Normal file
67
pkgs/development/python-modules/trafilatura/default.nix
Normal file
|
@ -0,0 +1,67 @@
|
|||
# Python package expression for trafilatura.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  certifi,
  charset-normalizer,
  courlan,
  htmldate,
  justext,
  lxml,
  urllib3,
  pytestCheckHook,
}:

buildPythonPackage rec {
  pname = "trafilatura";
  version = "1.6.3";
  format = "setuptools";

  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-Zx3W4AAOEBxLzo1w9ECLy3n8vyJ17iVZHv4z4sihYA0=";
  };

  # Two source tweaks:
  #  - remove the trafilatura_gui entry point, since the GUI CLI is not
  #    supported in this packaging;
  #  - make the CLI tests call the trafilatura binary under $out/bin.
  postPatch = ''
    substituteInPlace setup.py --replace '"trafilatura_gui=trafilatura.gui:main",' ""
    substituteInPlace tests/cli_tests.py --replace "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
  '';

  propagatedBuildInputs = [
    certifi
    charset-normalizer
    courlan
    htmldate
    justext
    lxml
    urllib3
  ];

  nativeCheckInputs = [
    pytestCheckHook
  ];

  # These tests require an internet connection.
  disabledTests = [
    "test_download"
    "test_fetch"
    "test_redirection"
    "test_meta_redirections"
    "test_crawl_page"
    "test_whole"
    "test_probing"
    "test_cli_pipeline"
  ];

  pythonImportsCheck = [ "trafilatura" ];

  meta = {
    description = "Python package and command-line tool designed to gather text on the Web";
    homepage = "https://trafilatura.readthedocs.io";
    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
    license = lib.licenses.gpl3Plus;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
|
|
@ -2397,6 +2397,8 @@ self: super: with self; {
|
|||
qemu = pkgs.qemu;
|
||||
};
|
||||
|
||||
courlan = callPackage ../development/python-modules/courlan { };
|
||||
|
||||
cov-core = callPackage ../development/python-modules/cov-core { };
|
||||
|
||||
coverage = callPackage ../development/python-modules/coverage { };
|
||||
|
@ -5280,6 +5282,8 @@ self: super: with self; {
|
|||
|
||||
html5-parser = callPackage ../development/python-modules/html5-parser { };
|
||||
|
||||
htmldate = callPackage ../development/python-modules/htmldate { };
|
||||
|
||||
htmllaundry = callPackage ../development/python-modules/htmllaundry { };
|
||||
|
||||
htmllistparse = callPackage ../development/python-modules/htmllistparse { };
|
||||
|
@ -6057,6 +6061,8 @@ self: super: with self; {
|
|||
|
||||
justbytes = callPackage ../development/python-modules/justbytes { };
|
||||
|
||||
justext = callPackage ../development/python-modules/justext { };
|
||||
|
||||
justnimbus = callPackage ../development/python-modules/justnimbus { };
|
||||
|
||||
jwcrypto = callPackage ../development/python-modules/jwcrypto { };
|
||||
|
@ -10085,6 +10091,8 @@ self: super: with self; {
|
|||
|
||||
py3exiv2 = callPackage ../development/python-modules/py3exiv2 { };
|
||||
|
||||
py3langid = callPackage ../development/python-modules/py3langid { };
|
||||
|
||||
py3nvml = callPackage ../development/python-modules/py3nvml { };
|
||||
|
||||
py3rijndael = callPackage ../development/python-modules/py3rijndael { };
|
||||
|
@ -14668,6 +14676,8 @@ self: super: with self; {
|
|||
|
||||
trackpy = callPackage ../development/python-modules/trackpy { };
|
||||
|
||||
trafilatura = callPackage ../development/python-modules/trafilatura { };
|
||||
|
||||
trailrunner = callPackage ../development/python-modules/trailrunner {};
|
||||
|
||||
trainer = callPackage ../development/python-modules/trainer {};
|
||||
|
|
Loading…
Reference in New Issue
Block a user