diff --git a/scripts/stubsabot.py b/scripts/stubsabot.py
index f4ff3dd20..6ceafd387 100644
--- a/scripts/stubsabot.py
+++ b/scripts/stubsabot.py
@@ -255,16 +255,18 @@ async def get_github_repo_info(session: aiohttp.ClientSession, stub_info: StubIn
     Else, return None.
     """
     if stub_info.upstream_repository:
+        # We have various sanity checks for the upstream_repository field in tests/parse_metadata.py,
+        # so no need to repeat all of them here
         split_url = urllib.parse.urlsplit(stub_info.upstream_repository)
-        if split_url.netloc == "github.com" and not split_url.query and not split_url.fragment:
+        if split_url.netloc == "github.com":
             url_path = split_url.path.strip("/")
-            if len(Path(url_path).parts) == 2:
-                github_tags_info_url = f"https://api.github.com/repos/{url_path}/tags"
-                async with session.get(github_tags_info_url, headers=get_github_api_headers()) as response:
-                    if response.status == 200:
-                        tags: list[dict[str, Any]] = await response.json()
-                        assert isinstance(tags, list)
-                        return GithubInfo(repo_path=url_path, tags=tags)
+            assert len(Path(url_path).parts) == 2
+            github_tags_info_url = f"https://api.github.com/repos/{url_path}/tags"
+            async with session.get(github_tags_info_url, headers=get_github_api_headers()) as response:
+                if response.status == 200:
+                    tags: list[dict[str, Any]] = await response.json()
+                    assert isinstance(tags, list)
+                    return GithubInfo(repo_path=url_path, tags=tags)
     return None
diff --git a/tests/parse_metadata.py b/tests/parse_metadata.py
index 6097049ea..483eb034c 100644
--- a/tests/parse_metadata.py
+++ b/tests/parse_metadata.py
@@ -6,6 +6,7 @@ from __future__ import annotations
 import os
 import re
+import urllib.parse
 from collections.abc import Mapping
 from dataclasses import dataclass
 from pathlib import Path
@@ -199,6 +200,21 @@ def read_metadata(distribution: str) -> StubMetadata:
     upstream_repository: object = data.get("upstream_repository")
     assert isinstance(upstream_repository, (str, type(None)))
+    if isinstance(upstream_repository, str):
+        parsed_url = urllib.parse.urlsplit(upstream_repository)
+        assert parsed_url.scheme == "https", "URLs in the upstream_repository field should use https"
+        assert not parsed_url.netloc.startswith("www."), "`www.` should be removed from URLs in the upstream_repository field"
+        assert not parsed_url.query
+        assert not parsed_url.fragment
+        if parsed_url.netloc == "github.com":
+            cleaned_url_path = parsed_url.path.strip("/")
+            num_url_path_parts = len(Path(cleaned_url_path).parts)
+            bad_github_url_msg = (
+                f"Invalid upstream_repository for {distribution!r}: "
+                "URLs for GitHub repositories always have two parts in their paths"
+            )
+            assert num_url_path_parts == 2, bad_github_url_msg
+
     obsolete_since: object = data.get("obsolete_since")
     assert isinstance(obsolete_since, (str, type(None)))
     no_longer_updated: object = data.get("no_longer_updated", False)
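
For reference, a minimal interactive sketch (not part of the change itself) of how urllib.parse.urlsplit and pathlib.Path decompose an upstream_repository URL that satisfies the new assertions; the repository URL below is only an illustrative example:

>>> import urllib.parse
>>> from pathlib import Path
>>> parsed = urllib.parse.urlsplit("https://github.com/python/typeshed")
>>> (parsed.scheme, parsed.netloc, parsed.path, parsed.query, parsed.fragment)
('https', 'github.com', '/python/typeshed', '', '')
>>> Path(parsed.path.strip("/")).parts  # exactly two parts: owner and repository name
('python', 'typeshed')
>>> len(Path(parsed.path.strip("/")).parts) == 2
True

Because tests/parse_metadata.py now rejects GitHub URLs with a query, a fragment, or a path that is not exactly owner/repo, stubsabot.py can downgrade those conditions to a plain assert rather than silently returning None.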