Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions crawl4ai/markdown_generation_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,20 @@ def fast_urljoin(base: str, url: str) -> str:
if url.startswith(("http://", "https://", "mailto:", "//")):
return url
if url.startswith("/"):
# Handle absolute paths
if base.endswith("/"):
return base[:-1] + url
return base + url
# Root-absolute path: per RFC 3986 it replaces base's *entire* path,
# so it must be resolved against base's scheme://authority root rather
# than appended to base (appending keeps base's own path and yields a
# broken URL such as ".../guide.html/api" instead of ".../api" when
# base points at a sub-page). Fall back to urljoin for bases without a
# clean authority boundary (e.g. a query/fragment but no path).
scheme_sep = base.find("://")
if scheme_sep != -1:
authority_end = base.find("/", scheme_sep + 3)
if authority_end != -1:
return base[:authority_end] + url
if "?" not in base and "#" not in base:
return base + url
return urljoin(base, url)
return urljoin(base, url)


Expand Down
68 changes: 68 additions & 0 deletions tests/test_fast_urljoin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
Regression tests for ``fast_urljoin`` root-absolute path handling.

``fast_urljoin`` previously appended a root-absolute href (``"/path"``) to the
full base URL instead of resolving it against base's ``scheme://authority``
root, so a link like ``/api`` on a crawled sub-page
``https://site/docs/guide.html`` became ``https://site/docs/guide.html/api``
instead of ``https://site/api``. These joins feed
``DefaultMarkdownGenerator.convert_links_to_citations``, so every root-absolute
link produced a broken citation/reference URL.

``fast_urljoin`` documents ``urllib.parse.urljoin`` as its fallback, so its
output for these cases must match ``urljoin``.
"""
from urllib.parse import urljoin

import pytest

from crawl4ai.markdown_generation_strategy import (
DefaultMarkdownGenerator,
fast_urljoin,
)

# (base, href) pairs for which fast_urljoin must return exactly what
# urllib.parse.urljoin returns. Grouped by the branch each pair exercises.
URLJOIN_PARITY_CASES = [
    # root-absolute href on a base that carries a path (the bug)
    ("https://example.com/docs/guide.html", "/api/reference"),
    ("https://example.com/a/b/c", "/x"),
    ("https://example.com:8080/docs/page", "/login"),
    ("https://example.com/docs/guide.html", "/a/b/c?x=1#frag"),
    ("https://sub.example.com/a/b?q=1#f", "/root"),
    # root-absolute href on a base that is just scheme://authority (already ok)
    ("https://example.com", "/api"),
    ("https://example.com/", "/api"),
    # root-absolute href on a base with a query/fragment but no path: there is
    # no "/" after the authority, so plain concatenation would be wrong and
    # the join has to take the urljoin fallback
    ("https://example.com?q=1", "/api"),
    ("https://example.com#frag", "/api"),
    # non-root-absolute forms keep delegating to urljoin
    ("https://example.com/docs/", "rel/page"),
    ("https://example.com/docs/guide.html", "../up"),
    ("https://example.com/docs/guide.html", "sibling.html"),
]


class TestFastUrljoinRootAbsolute:
    """Regression tests for how ``fast_urljoin`` resolves root-absolute hrefs."""

    @pytest.mark.parametrize("base, href", URLJOIN_PARITY_CASES)
    def test_matches_urljoin(self, base, href):
        """Every parity case must resolve exactly as ``urllib.parse.urljoin`` does."""
        assert fast_urljoin(base, href) == urljoin(base, href)

    def test_root_absolute_replaces_base_path(self):
        """A root-absolute link on a sub-page must not inherit the sub-page's path."""
        # Expected value is spelled out literally (not computed via urljoin) so
        # the intended URL is visible in the test itself.
        assert (
            fast_urljoin("https://example.com/docs/guide.html", "/api/reference")
            == "https://example.com/api/reference"
        )

    def test_absolute_and_special_schemes_unchanged(self):
        """Hrefs matching a pass-through prefix are returned verbatim."""
        # Pre-existing fast paths must be preserved: one assertion per prefix
        # that fast_urljoin short-circuits on.
        assert fast_urljoin("https://example.com/p", "http://other.com/x") == "http://other.com/x"
        assert fast_urljoin("https://example.com/p", "https://other.com/x") == "https://other.com/x"
        assert fast_urljoin("https://example.com/p", "mailto:a@b.com") == "mailto:a@b.com"
        assert fast_urljoin("https://example.com/p", "//cdn.example.com/x") == "//cdn.example.com/x"

    def test_citations_use_corrected_urls(self):
        """Citations built from markdown links use the corrected joined URL."""
        # End-to-end through the public markdown citation path.
        gen = DefaultMarkdownGenerator()
        md = "See the [API](/api/reference)."
        _, references = gen.convert_links_to_citations(
            md, base_url="https://example.com/docs/guide.html"
        )
        assert "https://example.com/api/reference" in references
        # The pre-fix output appended the href to the sub-page path.
        assert "guide.html/api" not in references