zytedata · wRAR · May 24, 2024 · BurnzZ · May 29, 2024 · BurnzZ
diff --git a/duplicate_url_discarder/_fingerprinter.py b/duplicate_url_discarder/_fingerprinter.py
@@ -4,7 +4,7 @@
 import os
 from typing import TYPE_CHECKING, List, Union
 
-from scrapy import Request
+from scrapy import Request, Spider, signals
 from scrapy.crawler import Crawler
 from scrapy.settings.default_settings import (
     REQUEST_FINGERPRINTER_CLASS as ScrapyRequestFingerprinter,
@@ -24,10 +24,10 @@
 class Fingerprinter:
     def __init__(self, crawler: Crawler):
         self.crawler: Crawler = crawler
-        rule_paths: List[Union[str, os.PathLike]] = self.crawler.settings.getlist(
+        self.rule_paths: List[Union[str, os.PathLike]] = self.crawler.settings.getlist(
             "DUD_LOAD_RULE_PATHS"
         )
-        if not rule_paths:
+        if not self.rule_paths:
             logger.warning("DUD_LOAD_RULE_PATHS is not set or is empty.")
         self._fallback_request_fingerprinter: RequestFingerprinterProtocol = (
             create_instance(
@@ -41,11 +41,16 @@ def __init__(self, crawler: Crawler):
                 crawler=crawler,
             )
         )
-        self.url_canonicalizer = UrlCanonicalizer(rule_paths)
+        self.url_canonicalizer = UrlCanonicalizer()
 
     @classmethod
     def from_crawler(cls, crawler: Crawler) -> Self:
-        return cls(crawler)
+        instance = cls(crawler)  # rules load later, on spider_opened
+        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
+        return instance
+
+    async def spider_opened(self, spider: Spider) -> None:  # signal handler
+        await self.url_canonicalizer.load_rules(self.rule_paths)
 
     def fingerprint(self, request: Request) -> bytes:
         if not request.meta.get("dud", True):

diff --git a/duplicate_url_discarder/url_canonicalizer.py b/duplicate_url_discarder/url_canonicalizer.py
@@ -4,6 +4,8 @@
 from pathlib import Path
 from typing import Dict, Iterable, Set, Union
 
+import treq
+from scrapy.utils.defer import maybe_deferred_to_future
 from url_matcher import URLMatcher
 
 from .processors import UrlProcessorBase, get_processor
@@ -13,11 +15,27 @@
 
 
 class UrlCanonicalizer:
-    def __init__(self, rule_paths: Iterable[Union[str, os.PathLike]]) -> None:
+    def __init__(self) -> None:  # starts with no rules; see load_rules()
+        self.processors: Dict[int, UrlProcessorBase] = {}
+        self.url_matcher = URLMatcher()
+
+    @staticmethod
+    def _is_url(path: str) -> bool:  # True only for http(s):// prefixes
+        return path.startswith(("http://", "https://"))
+
+    async def load_rules(self, rule_paths: Iterable[Union[str, os.PathLike]]) -> None:
+        if self.processors:
+            raise RuntimeError("UrlCanonicalizer.load_rules() can only be called once.")
+
         rules: Set[UrlRule] = set()
         full_rule_count = 0
         for rule_path in rule_paths:
-            data = Path(rule_path).read_text()
+            if isinstance(rule_path, str) and self._is_url(rule_path):
+                # Raw Deferreds cannot be awaited under the asyncio reactor.
+                response = await maybe_deferred_to_future(treq.get(rule_path))
+                data: str = await maybe_deferred_to_future(response.text())
+            else:
+                data = Path(rule_path).read_text()
             loaded_rules = load_rules(data)
             full_rule_count += len(loaded_rules)
             rules.update(loaded_rules)
@@ -26,8 +44,6 @@ def __init__(self, rule_paths: Iterable[Union[str, os.PathLike]]) -> None:
             f"Loaded {rule_count} rules, skipped {full_rule_count - rule_count} duplicates."
         )
 
-        self.url_matcher = URLMatcher()
-        self.processors: Dict[int, UrlProcessorBase] = {}
         rule_id = 0
         for rule in sorted(rules, key=operator.attrgetter("order")):
             processor = get_processor(rule)

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ classifiers = [
 requires-python = ">=3.8"
 dependencies = [
     "Scrapy >= 2.7.0",
+    "treq >= 21.5.0",
     "url-matcher >= 0.5.0",
     "w3lib >= 1.22.0",
 ]
@@ -46,6 +47,7 @@ multi_line_output = 3
 [[tool.mypy.overrides]]
 module = [
     "scrapy.*",
+    "treq.*",
     "url_matcher.*",
 ]
 ignore_missing_imports = true

diff --git a/tests/test_fingerprinter.py b/tests/test_fingerprinter.py
@@ -2,16 +2,19 @@
 from pathlib import Path
 from typing import Any, Dict
 
+import pytest
 from scrapy import Request, Spider
 from scrapy.dupefilters import BaseDupeFilter, RFPDupeFilter
 from scrapy.utils.test import get_crawler
 
 from duplicate_url_discarder import Fingerprinter
 
 
-def get_fingerprinter(settings_dict: Dict[str, Any]) -> Fingerprinter:
+async def get_fingerprinter(settings_dict: Dict[str, Any]) -> Fingerprinter:
     crawler = get_crawler(Spider, settings_dict)
-    return Fingerprinter.from_crawler(crawler)
+    fingerprinter = Fingerprinter.from_crawler(crawler)
+    await fingerprinter.spider_opened(crawler.spider)  # loads the rules
+    return fingerprinter
 
 
 def get_df(fingerprinter: Fingerprinter) -> BaseDupeFilter:
@@ -20,7 +23,8 @@ def get_df(fingerprinter: Fingerprinter) -> BaseDupeFilter:
     )
 
 
-def test_fingerprinter(tmp_path):
+@pytest.mark.asyncio
+async def test_fingerprinter(tmp_path):
     rules_path = Path(tmp_path) / "rules.json"
     rules_path.write_text(
         json.dumps(
@@ -40,7 +44,7 @@ def test_fingerprinter(tmp_path):
             ]
         )
     )
-    fingerprinter = get_fingerprinter({"DUD_LOAD_RULE_PATHS": [str(rules_path)]})
+    fingerprinter = await get_fingerprinter({"DUD_LOAD_RULE_PATHS": [str(rules_path)]})
     assert len(fingerprinter.url_canonicalizer.processors) == 2
 
     def get_stat(stat: str) -> Any:

diff --git a/tests/test_url_canonicalizer.py b/tests/test_url_canonicalizer.py
@@ -8,11 +8,12 @@
 
 
 def test_url_canonicalizer_empty():
-    url_canonicalizer = UrlCanonicalizer([])
+    url_canonicalizer = UrlCanonicalizer()  # no rules loaded
     assert url_canonicalizer.process_url("http://foo.example") == "http://foo.example"
 
 
-def test_url_canonicalizer_load(tmp_path):
+@pytest.mark.asyncio
+async def test_url_canonicalizer_load(tmp_path):
     empty_path = Path(tmp_path) / "empty.json"
     empty_path.write_text("[]")
     rules_path = Path(tmp_path) / "rules.json"
@@ -34,7 +35,8 @@ def test_url_canonicalizer_load(tmp_path):
             ]
         )
     )
-    url_canonicalizer = UrlCanonicalizer([str(empty_path), rules_path])
+    url_canonicalizer = UrlCanonicalizer()
+    await url_canonicalizer.load_rules([str(empty_path), rules_path])
     assert len(url_canonicalizer.processors) == 2
     assert (
         url_canonicalizer.process_url("http://foo.example/?foo=1&bbn=1&PHPSESSIONID=1")
@@ -46,7 +48,8 @@ def test_url_canonicalizer_load(tmp_path):
     )
 
 
-def test_url_canonicalizer_unknown_processor(tmp_path):
+@pytest.mark.asyncio
+async def test_url_canonicalizer_unknown_processor(tmp_path):
     rules_path = Path(tmp_path) / "rules.json"
     rules_path.write_text(
         json.dumps(
@@ -67,7 +70,7 @@ def test_url_canonicalizer_unknown_processor(tmp_path):
         )
     )
     with pytest.raises(ValueError, match="No URL processor named unknown"):
-        UrlCanonicalizer([rules_path])
+        await UrlCanonicalizer().load_rules([rules_path])
 
 
 @pytest.mark.parametrize(
@@ -78,7 +81,10 @@ def test_url_canonicalizer_unknown_processor(tmp_path):
         (2, 1),
     ],
 )
-def test_url_canonicalizer_multiple_rules_same_processor(tmp_path, order1, order2):
+@pytest.mark.asyncio
+async def test_url_canonicalizer_multiple_rules_same_processor(
+    tmp_path, order1, order2
+):
     rules_path = Path(tmp_path) / "rules.json"
     rules_path.write_text(
         json.dumps(
@@ -98,15 +104,17 @@ def test_url_canonicalizer_multiple_rules_same_processor(tmp_path, order1, order
             ]
         )
     )
-    url_canonicalizer = UrlCanonicalizer([rules_path])
+    url_canonicalizer = UrlCanonicalizer()
+    await url_canonicalizer.load_rules([rules_path])
     assert len(url_canonicalizer.processors) == 2
     assert (
         url_canonicalizer.process_url("https://example.com?utm_source=cat&bbn=1&ref=g")
         == "https://example.com"
     )
 
 
-def test_url_canonicalizer_duplicate_rules(tmp_path, caplog):
+@pytest.mark.asyncio
+async def test_url_canonicalizer_duplicate_rules(tmp_path, caplog):
     rules_path = Path(tmp_path) / "rules.json"
     rules_path.write_text(
         json.dumps(
@@ -138,7 +146,8 @@ def test_url_canonicalizer_duplicate_rules(tmp_path, caplog):
             ]
         )
     )
+    url_canonicalizer = UrlCanonicalizer()
     with caplog.at_level(logging.INFO):
-        url_canonicalizer = UrlCanonicalizer([rules_path])
+        await url_canonicalizer.load_rules([rules_path])
     assert len(url_canonicalizer.processors) == 3
     assert "Loaded 3 rules, skipped 1 duplicates." in caplog.text
diff --git a/tox.ini b/tox.ini
@@ -4,6 +4,7 @@ envlist = py,pre-commit,mypy,docs,twinecheck
 [testenv]
 deps =
     pytest
+    pytest-asyncio
     pytest-cov
 commands =
     py.test \
@@ -17,6 +18,7 @@ basepython = python3.8
 deps =
     {[testenv]deps}
     Scrapy==2.7.0
+    treq==21.5.0
     url-matcher==0.5.0
     w3lib==1.22.0