From a85aad631b29b92c5cabbd4197e413e909bdf070 Mon Sep 17 00:00:00 2001
From: jichao wang <jichaowang02@gmail.com>
Date: Sun, 21 Jun 2026 17:33:38 +0100
Subject: [PATCH] Fix fast_urljoin mangling root-absolute paths on pages with a
 path

fast_urljoin() handled a root-absolute href ("/path") by appending it to
the full base URL (base + url), which is only correct when base has no path
beyond the host. Per RFC 3986 a root-absolute reference replaces the base's
entire path, so for a real crawled sub-page the join was wrong:

    fast_urljoin("https://example.com/docs/guide.html", "/api/reference")
      ->  "https://example.com/docs/guide.html/api/reference"   (broken)
    urllib.parse.urljoin(...)  ->  "https://example.com/api/reference"

The function documents urljoin as its fallback, so this diverges from its own
intended contract. DefaultMarkdownGenerator.convert_links_to_citations() joins
every link with fast_urljoin(base_url, href), so every root-absolute link in a
crawled page produced a broken citation/reference URL.

Resolve root-absolute paths against base's scheme://authority root, keeping
the fast path for the common case and deferring to urljoin for unusual bases.

Adds tests/test_fast_urljoin.py asserting parity with urllib.parse.urljoin for
root-absolute, already-correct, and relative cases, plus an end-to-end check
through convert_links_to_citations. The root-absolute cases fail on the old
code and pass with this fix.
---
 crawl4ai/markdown_generation_strategy.py | 18 +++++--
 tests/test_fast_urljoin.py               | 68 ++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_fast_urljoin.py

diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py
index 8275eb43c..c6e7feeed 100644
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -16,10 +16,20 @@ def fast_urljoin(base: str, url: str) -> str:
     if url.startswith(("http://", "https://", "mailto:", "//")):
         return url
     if url.startswith("/"):
-        # Handle absolute paths
-        if base.endswith("/"):
-            return base[:-1] + url
-        return base + url
+        # Root-absolute path: per RFC 3986 it replaces base's *entire* path,
+        # so it must be resolved against base's scheme://authority root rather
+        # than appended to base (appending keeps base's own path and yields a
+        # broken URL such as ".../guide.html/api" instead of ".../api" when
+        # base points at a sub-page). Fall back to urljoin for bases without a
+        # clean authority boundary (e.g. a query/fragment but no path).
+        scheme_sep = base.find("://")
+        if scheme_sep != -1:
+            authority_end = base.find("/", scheme_sep + 3)
+            if authority_end != -1:
+                return base[:authority_end] + url
+            if "?" not in base and "#" not in base:
+                return base + url
+        return urljoin(base, url)
     return urljoin(base, url)
 
 
diff --git a/tests/test_fast_urljoin.py b/tests/test_fast_urljoin.py
new file mode 100644
index 000000000..50bc06006
--- /dev/null
+++ b/tests/test_fast_urljoin.py
@@ -0,0 +1,68 @@
+"""
+Regression tests for ``fast_urljoin`` root-absolute path handling.
+
+``fast_urljoin`` previously appended a root-absolute href (``"/path"``) to the
+full base URL instead of resolving it against base's ``scheme://authority``
+root, so a link like ``/api`` on a crawled sub-page
+``https://site/docs/guide.html`` became ``https://site/docs/guide.html/api``
+instead of ``https://site/api``. These joins feed
+``DefaultMarkdownGenerator.convert_links_to_citations``, so every root-absolute
+link produced a broken citation/reference URL.
+
+``fast_urljoin`` documents ``urllib.parse.urljoin`` as its fallback, so its
+output for these cases must match ``urljoin``.
+"""
+from urllib.parse import urljoin
+
+import pytest
+
+from crawl4ai.markdown_generation_strategy import (
+    DefaultMarkdownGenerator,
+    fast_urljoin,
+)
+
+# (base, href) pairs that must all agree with urllib.parse.urljoin.
+URLJOIN_PARITY_CASES = [
+    # root-absolute href on a base that carries a path (the bug)
+    ("https://example.com/docs/guide.html", "/api/reference"),
+    ("https://example.com/a/b/c", "/x"),
+    ("https://example.com:8080/docs/page", "/login"),
+    ("https://example.com/docs/guide.html", "/a/b/c?x=1#frag"),
+    ("https://sub.example.com/a/b?q=1#f", "/root"),
+    # root-absolute href on a base that is just scheme://authority (already ok)
+    ("https://example.com", "/api"),
+    ("https://example.com/", "/api"),
+    # non-root-absolute forms keep delegating to urljoin
+    ("https://example.com/docs/", "rel/page"),
+    ("https://example.com/docs/guide.html", "../up"),
+    ("https://example.com/docs/guide.html", "sibling.html"),
+]
+
+
+class TestFastUrljoinRootAbsolute:
+    @pytest.mark.parametrize("base, href", URLJOIN_PARITY_CASES)
+    def test_matches_urljoin(self, base, href):
+        assert fast_urljoin(base, href) == urljoin(base, href)
+
+    def test_root_absolute_replaces_base_path(self):
+        # The specific bug: a root-absolute link on a sub-page must not inherit
+        # the sub-page's path.
+        assert (
+            fast_urljoin("https://example.com/docs/guide.html", "/api/reference")
+            == "https://example.com/api/reference"
+        )
+
+    def test_absolute_and_special_schemes_unchanged(self):
+        # Pre-existing fast paths must be preserved.
+        assert fast_urljoin("https://example.com/p", "https://other.com/x") == "https://other.com/x"
+        assert fast_urljoin("https://example.com/p", "mailto:a@b.com") == "mailto:a@b.com"
+
+    def test_citations_use_corrected_urls(self):
+        # End-to-end through the public markdown citation path.
+        gen = DefaultMarkdownGenerator()
+        md = "See the [API](/api/reference)."
+        _, references = gen.convert_links_to_citations(
+            md, base_url="https://example.com/docs/guide.html"
+        )
+        assert "https://example.com/api/reference" in references
+        assert "guide.html/api" not in references