Source code for ramble.test.util.web

# Copyright 2022-2026 The Ramble Authors
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
"""Perform tests of the util/web functions"""

import os
from io import BytesIO
from urllib.error import URLError

import pytest

from ramble.util import web

from spack.util import url as url_util


def test_get_header():
    """Exercise ``web.get_header`` with exact, fuzzy, and missing header names."""
    headers = {"Content-type": "text/plain"}

    # A key spelled exactly like the stored one returns its value.
    assert web.get_header(headers, "Content-type") == "text/plain"

    # test fuzzy lookup
    assert web.get_header(headers, "contentType") == "text/plain"

    # Once a key spelled exactly like the query exists, its value is returned.
    headers["contentType"] = "text/html"
    assert web.get_header(headers, "contentType") == "text/html"

    # test no match
    with pytest.raises(KeyError):
        web.get_header(headers, "ContentLength")
def test_gcs_url_exists(monkeypatch):
    """Check that ``web.url_exists`` on a ``gs://`` URL surfaces the client error.

    ``spack.util.gcs.gcs_client`` is swapped for a factory that returns a
    ``MockGcsClient``, so the error raised by its ``bucket`` method is the one
    expected to reach the caller.
    """
    # Imported locally so the attribute can be patched on the module object.
    import spack.util.gcs

    def _get_client():
        return MockGcsClient()

    monkeypatch.setattr(spack.util.gcs, "gcs_client", _get_client)

    test_url = "gs://abc/xyz.txt"
    with pytest.raises(MockGcsClientError, match="Mock error for bucket abc"):
        web.url_exists(test_url)
class MockGcsClient:
    """Stand-in for a GCS client whose bucket lookup always fails."""

    def bucket(self, name):
        """Raise ``MockGcsClientError`` with a message naming ``name``."""
        raise MockGcsClientError(f"Mock error for bucket {name}")
class MockGcsClientError(Exception):
    """Exception type raised by ``MockGcsClient.bucket``."""
def test_uses_ssl(monkeypatch):
    """Check ``web.uses_ssl`` across https, http, s3, gs, and file URLs."""
    # Start from a known state: the "s3 without S3_ENDPOINT_URL" case below
    # must not be influenced by a value inherited from the caller's environment.
    monkeypatch.delenv("S3_ENDPOINT_URL", raising=False)

    # Test https
    url = url_util.parse("https://example.com")
    assert web.uses_ssl(url)

    # Test http
    url = url_util.parse("http://example.com")
    assert not web.uses_ssl(url)

    # Test s3 without S3_ENDPOINT_URL
    url = url_util.parse("s3://my-bucket/key")
    assert web.uses_ssl(url)

    # Test s3 with http S3_ENDPOINT_URL
    monkeypatch.setenv("S3_ENDPOINT_URL", "http://s3.local")
    url = url_util.parse("s3://my-bucket/key")
    assert not web.uses_ssl(url)

    # Test s3 with https S3_ENDPOINT_URL
    monkeypatch.setenv("S3_ENDPOINT_URL", "https://s3.local")
    url = url_util.parse("s3://my-bucket/key")
    assert web.uses_ssl(url)

    monkeypatch.delenv("S3_ENDPOINT_URL")

    # Test gs
    url = url_util.parse("gs://my-bucket/key")
    assert web.uses_ssl(url)

    # Test file
    url = url_util.parse("file:///path/to/file")
    assert not web.uses_ssl(url)
def test_url_exists_file(tmpdir):
    """Check ``web.url_exists`` for a present and an absent ``file://`` URL."""
    # Test existing file
    p = tmpdir.join("exists.txt")
    p.write("content")
    assert web.url_exists(f"file://{str(p)}")

    # Test non-existing file (the path is nested beneath the regular file above)
    assert not web.url_exists(f"file://{str(p)}/nonexistent.txt")
def test_push_to_url_file(tmpdir):
    """Check ``web.push_to_url`` to a ``file://`` target in copy and move modes."""
    local_file = tmpdir.join("local.txt")
    local_file.write("some data")

    remote_dir = tmpdir.mkdir("remote")
    remote_file_path = remote_dir.join("remote.txt")

    # Test copy
    web.push_to_url(str(local_file), f"file://{str(remote_file_path)}", keep_original=True)
    assert local_file.exists()
    assert remote_file_path.exists()
    assert remote_file_path.read() == "some data"

    # Clear the destination so the move case starts without a target file.
    remote_file_path.remove()

    # Test move
    web.push_to_url(str(local_file), f"file://{str(remote_file_path)}", keep_original=False)
    assert not local_file.exists()
    assert remote_file_path.exists()
    assert remote_file_path.read() == "some data"
def test_remove_url_file(tmpdir):
    """Check ``web.remove_url`` on a single file and on a directory tree."""
    # Test remove file
    p = tmpdir.join("file.txt")
    p.write("content")
    web.remove_url(f"file://{str(p)}")
    assert not p.exists()

    # Test remove directory recursively
    d = tmpdir.mkdir("dir")
    f = d.join("file.txt")
    f.write("content")
    web.remove_url(f"file://{str(d)}", recursive=True)
    assert not d.exists()
def test_list_url_file(tmpdir):
    """Check ``web.list_url`` on a ``file://`` directory, flat and recursive."""
    # Layout: dir/file1.txt, dir/file2.txt, dir/subdir/file3.txt
    d = tmpdir.mkdir("dir")
    f1 = d.join("file1.txt")
    f1.write("content")
    f2 = d.join("file2.txt")
    f2.write("content")
    sub = d.mkdir("subdir")
    f3 = sub.join("file3.txt")
    f3.write("content")

    # Test non-recursive
    file_list = web.list_url(f"file://{str(d)}")
    assert sorted(file_list) == ["file1.txt", "file2.txt"]

    # Test recursive
    file_list = web.list_url(f"file://{str(d)}", recursive=True)
    assert sorted(file_list) == [
        "file1.txt",
        "file2.txt",
        os.path.join("subdir", "file3.txt"),
    ]
class MockUrlOpenResponse:
    """Minimal response object handed back by a mocked ``urlopen``.

    Exposes ``geturl()``, ``read()``, a ``headers`` attribute, and the
    context-manager protocol.
    """

    def __init__(self, url, content, headers):
        """Store the URL and headers and wrap ``content`` (bytes) in a stream."""
        self._url = url
        # BytesIO lets read() consume the payload incrementally.
        self.content_stream = BytesIO(content)
        self.headers = headers

    def geturl(self):
        """Return the URL this response was created with."""
        return self._url

    def read(self, *args):
        """Read from the content stream, forwarding any size argument."""
        return self.content_stream.read(*args)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; returning None lets exceptions propagate.
        pass
@pytest.mark.parametrize(
    "pages_data, start_url, depth, expected_pages, expected_links",
    [
        # depth 0: only the start page is returned; its link is still collected.
        (
            {
                "https://example.com/page1": {
                    "content": b'<html><body><a href="/page2">Page 2</a></body></html>'
                }
            },
            "https://example.com/page1",
            0,
            {"https://example.com/page1": '<html><body><a href="/page2">Page 2</a></body></html>'},
            {"https://example.com/page2"},
        ),
        # depth 1: the relative link on the start page is followed.
        (
            {
                "https://example.com/page1": {
                    "content": b'<html><body><a href="page2.html">Page 2</a></body></html>',
                },
                "https://example.com/page2.html": {
                    "content": b"<html><body>No links here</body></html>",
                },
            },
            "https://example.com/page1",
            1,
            {
                "https://example.com/page1": '<html><body><a href="page2.html">Page 2</a></body></html>',
                "https://example.com/page2.html": "<html><body>No links here</body></html>",
            },
            {"https://example.com/page2.html"},
        ),
        # depth 2: two pages that link to each other; each appears once.
        (
            {
                "https://example.com/pageA": {
                    "content": b'<html><body><a href="pageB.html">Page B</a></body></html>',
                },
                "https://example.com/pageB.html": {
                    "content": b'<html><body><a href="/pageA">Page A</a></body></html>',
                },
            },
            "https://example.com/pageA",
            2,
            {
                "https://example.com/pageA": '<html><body><a href="pageB.html">Page B</a></body></html>',
                "https://example.com/pageB.html": '<html><body><a href="/pageA">Page A</a></body></html>',
            },
            {"https://example.com/pageA", "https://example.com/pageB.html"},
        ),
    ],
)
def test_spider(monkeypatch, pages_data, start_url, depth, expected_pages, expected_links):
    """Centralize the mocked urlopen for spider tests to reduce duplication.

    ``web._urlopen`` is replaced by a function that serves ``pages_data`` from
    memory, then ``web.spider`` is run from ``start_url`` at the given ``depth``
    and its ``(pages, links)`` result is compared with the expected values.
    """

    def mock_urlopen(req, *args, **kwargs):
        url = req.get_full_url()
        method = req.get_method()
        if url in pages_data:
            headers = {"Content-type": "text/html"}
            if method == "HEAD":
                # HEAD carries headers only, so the body is empty.
                return MockUrlOpenResponse(url, b"", headers)
            elif method == "GET":
                return MockUrlOpenResponse(url, pages_data[url]["content"], headers)
        # Unknown URL, or a method other than HEAD/GET on a known URL.
        raise URLError(f"URL not found: {url}")

    monkeypatch.setattr(web, "_urlopen", mock_urlopen)

    pages, links = web.spider(start_url, depth=depth)

    assert pages == expected_pages
    assert links == expected_links