From a85aad631b29b92c5cabbd4197e413e909bdf070 Mon Sep 17 00:00:00 2001 From: jichao wang Date: Sun, 21 Jun 2026 17:33:38 +0100 Subject: [PATCH] Fix fast_urljoin mangling root-absolute paths on pages with a path fast_urljoin() handled a root-absolute href ("/path") by appending it to the full base URL (base + url), which is only correct when base has no path beyond the host. Per RFC 3986 a root-absolute reference replaces the base's entire path, so for a real crawled sub-page the join was wrong: fast_urljoin("https://example.com/docs/guide.html", "/api/reference") -> "https://example.com/docs/guide.html/api/reference" (broken) urllib.parse.urljoin(...) -> "https://example.com/api/reference" The function documents urljoin as its fallback, so this diverges from its own intended contract. DefaultMarkdownGenerator.convert_links_to_citations() joins every link with fast_urljoin(base_url, href), so every root-absolute link in a crawled page produced a broken citation/reference URL. Resolve root-absolute paths against base's scheme://authority root, keeping the fast path for the common case and deferring to urljoin for unusual bases. Adds tests/test_fast_urljoin.py asserting parity with urllib.parse.urljoin for root-absolute, already-correct, and relative cases, plus an end-to-end check through convert_links_to_citations. The root-absolute cases fail on the old code and pass with this fix. --- crawl4ai/markdown_generation_strategy.py | 18 +++++-- tests/test_fast_urljoin.py | 68 ++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 4 deletions(-) create mode 100644 tests/test_fast_urljoin.py diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 8275eb43c..c6e7feeed 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -16,10 +16,20 @@ def fast_urljoin(base: str, url: str) -> str: if url.startswith(("http://", "https://", "mailto:", "//")): return url if url.startswith("/"): - # Handle absolute paths - if base.endswith("/"): - return base[:-1] + url - return base + url + # Root-absolute path: per RFC 3986 it replaces base's *entire* path, + # so it must be resolved against base's scheme://authority root rather + # than appended to base (appending keeps base's own path and yields a + # broken URL such as ".../guide.html/api" instead of ".../api" when + # base points at a sub-page). Fall back to urljoin for bases without a + # clean authority boundary (e.g. a query/fragment but no path). + scheme_sep = base.find("://") + if scheme_sep != -1: + authority_end = base.find("/", scheme_sep + 3) + if authority_end != -1: + return base[:authority_end] + url + if "?" not in base and "#" not in base: + return base + url + return urljoin(base, url) return urljoin(base, url) diff --git a/tests/test_fast_urljoin.py b/tests/test_fast_urljoin.py new file mode 100644 index 000000000..50bc06006 --- /dev/null +++ b/tests/test_fast_urljoin.py @@ -0,0 +1,68 @@ +""" +Regression tests for ``fast_urljoin`` root-absolute path handling. + +``fast_urljoin`` previously appended a root-absolute href (``"/path"``) to the +full base URL instead of resolving it against base's ``scheme://authority`` +root, so a link like ``/api`` on a crawled sub-page +``https://site/docs/guide.html`` became ``https://site/docs/guide.html/api`` +instead of ``https://site/api``. These joins feed +``DefaultMarkdownGenerator.convert_links_to_citations``, so every root-absolute +link produced a broken citation/reference URL. + +``fast_urljoin`` documents ``urllib.parse.urljoin`` as its fallback, so its +output for these cases must match ``urljoin``. +""" +from urllib.parse import urljoin + +import pytest + +from crawl4ai.markdown_generation_strategy import ( + DefaultMarkdownGenerator, + fast_urljoin, +) + +# (base, href) pairs that must all agree with urllib.parse.urljoin. +URLJOIN_PARITY_CASES = [ + # root-absolute href on a base that carries a path (the bug) + ("https://example.com/docs/guide.html", "/api/reference"), + ("https://example.com/a/b/c", "/x"), + ("https://example.com:8080/docs/page", "/login"), + ("https://example.com/docs/guide.html", "/a/b/c?x=1#frag"), + ("https://sub.example.com/a/b?q=1#f", "/root"), + # root-absolute href on a base that is just scheme://authority (already ok) + ("https://example.com", "/api"), + ("https://example.com/", "/api"), + # non-root-absolute forms keep delegating to urljoin + ("https://example.com/docs/", "rel/page"), + ("https://example.com/docs/guide.html", "../up"), + ("https://example.com/docs/guide.html", "sibling.html"), +] + + +class TestFastUrljoinRootAbsolute: + @pytest.mark.parametrize("base, href", URLJOIN_PARITY_CASES) + def test_matches_urljoin(self, base, href): + assert fast_urljoin(base, href) == urljoin(base, href) + + def test_root_absolute_replaces_base_path(self): + # The specific bug: a root-absolute link on a sub-page must not inherit + # the sub-page's path. + assert ( + fast_urljoin("https://example.com/docs/guide.html", "/api/reference") + == "https://example.com/api/reference" + ) + + def test_absolute_and_special_schemes_unchanged(self): + # Pre-existing fast paths must be preserved. + assert fast_urljoin("https://example.com/p", "https://other.com/x") == "https://other.com/x" + assert fast_urljoin("https://example.com/p", "mailto:a@b.com") == "mailto:a@b.com" + + def test_citations_use_corrected_urls(self): + # End-to-end through the public markdown citation path. + gen = DefaultMarkdownGenerator() + md = "See the [API](/api/reference)." + _, references = gen.convert_links_to_citations( + md, base_url="https://example.com/docs/guide.html" + ) + assert "https://example.com/api/reference" in references + assert "guide.html/api" not in references