From e4c15b030ce7aa3a827e95fac98a02ef3bb958d7 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Tue, 14 Oct 2025 17:40:15 +0300 Subject: [PATCH] Add support for collecting GitHub vulnerability-related issues and pull requests Add tests for this functionality Signed-off-by: ziad hany --- vulnerabilities/importers/__init__.py | 2 + .../pipelines/v2_importers/github_issue_pr.py | 92 +++++++++++++++++++ .../v2_importers/test_github_issue_pr.py | 80 ++++++++++++++++ .../expected_advisory_output.json | 64 +++++++++++++ .../github_issue_pr/issues_and_pr.json | 24 +++++ 5 files changed, 262 insertions(+) create mode 100644 vulnerabilities/pipelines/v2_importers/github_issue_pr.py create mode 100644 vulnerabilities/tests/pipelines/v2_importers/test_github_issue_pr.py create mode 100644 vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json create mode 100644 vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 82ee4525a..a7d6625d2 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -47,6 +47,7 @@ from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, ) +from vulnerabilities.pipelines.v2_importers import github_issue_pr as github_issue_pr_v2 from vulnerabilities.pipelines.v2_importers import github_osv_importer as github_osv_importer_v2 from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2 @@ -115,5 +116,6 @@ ubuntu_usn.UbuntuUSNImporter, fireeye.FireyeImporter, oss_fuzz.OSSFuzzImporter, + github_issue_pr_v2.GithubPipelineIssuePR, ] ) diff --git a/vulnerabilities/pipelines/v2_importers/github_issue_pr.py b/vulnerabilities/pipelines/v2_importers/github_issue_pr.py new file mode 100644 index 000000000..ec33e925e --- /dev/null +++ 
# vulnerabilities/pipelines/v2_importers/github_issue_pr.py
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import re
from collections import defaultdict

from github import Github

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import ReferenceV2
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
from vulnerablecode.settings import env

# This module is imported by the importer registry, so a missing variable must
# not raise at import time. Fall back to None and let the client be built
# without a token.
GITHUB_TOKEN = env.str("GITHUB_TOKEN", default=None)

# Compiled once at import time instead of on every grouping call.
VULNERABILITY_ID_PATTERN = re.compile(
    r"(CVE-\d{4}-\d+|PYSEC-\d{4}-\d+|GHSA-[\w-]+)", re.IGNORECASE
)


def _normalize_vulnerability_id(raw_id):
    """
    Return ``raw_id`` in a single canonical spelling.

    The pattern matches case-insensitively, so the same identifier can be
    captured as ``cve-2023-1234`` and ``CVE-2023-1234``. CVE and PYSEC ids are
    upper-cased; GHSA ids keep an upper-case prefix and a lower-case suffix.
    """
    prefix, separator, rest = raw_id.partition("-")
    prefix = prefix.upper()
    if prefix == "GHSA":
        return f"{prefix}{separator}{rest.lower()}"
    return raw_id.upper()


def _extract_vulnerability_ids(text):
    """
    Return the normalized vulnerability ids found in ``text``.

    Ids are returned in order of first appearance and without duplicates, so an
    id repeated in both the title and the body of one item is reported once.
    """
    matches = VULNERABILITY_ID_PATTERN.findall(text)
    return list(dict.fromkeys(_normalize_vulnerability_id(match) for match in matches))


class GithubPipelineIssuePR(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect GitHub issues and PRs related to vulnerabilities.
    """

    pipeline_id = "collect_issues_pr"

    @classmethod
    def steps(cls):
        return (
            cls.fetch_entries,
            cls.collect_and_store_advisories,
        )

    def fetch_entries(self):
        """
        Search one GitHub repository for issues and PRs mentioning CVE, PYSEC or GHSA.

        Store the search results on ``self.issues`` and ``self.pull_requestes``
        and the repository web URL on ``self.repo_url``.
        """
        repo_name = "django/django"
        # Derive the URL from the searched repository so the URL attached to
        # each advisory always names the repository the results came from.
        self.repo_url = f"https://github.com/{repo_name}"

        client = Github(login_or_token=GITHUB_TOKEN)

        base_query = f"repo:{repo_name} (CVE OR PYSEC OR GHSA)"
        self.issues = client.search_issues(f"{base_query} is:issue")
        # NOTE: the attribute name is misspelled but is kept as-is because
        # existing callers and tests assign it directly.
        self.pull_requestes = client.search_issues(f"{base_query} is:pr")

    def advisories_count(self) -> int:
        """
        Return the combined number of issue and PR search results.

        This counts search hits, not unique vulnerability identifiers: the
        grouping done in ``collect_issues_and_prs`` may yield a different number.
        """
        return self.issues.totalCount + self.pull_requestes.totalCount

    def collect_issues_and_prs(self):
        """
        Group issues and PRs by vulnerability identifiers (like CVE-xxxx-yyyy).
        Returns a dict mapping vuln_id -> [(type, html_url)], where type is
        "Issue" or "PR".
        """
        self.log("Grouping GitHub issues and PRs by vulnerability identifiers.")

        grouped_items = defaultdict(list)
        # Issues are processed before PRs so the reference order stays stable.
        sources = (("Issue", self.issues), ("PR", self.pull_requestes))
        for kind, items in sources:
            for item in items:
                text = f"{item.title or ''} {item.body or ''}"
                for vuln_id in _extract_vulnerability_ids(text):
                    grouped_items[vuln_id].append((kind, item.html_url))

        self.log(f"Grouped {len(grouped_items)} unique vulnerability identifiers.")
        return grouped_items

    def collect_advisories(self):
        """
        Yield one AdvisoryData per vulnerability ID, carrying the related
        GitHub issues and PRs as references.
        """
        self.log("Generating AdvisoryData objects from GitHub issues and PRs.")
        grouped_data = self.collect_issues_and_prs()

        for vuln_id, refs in grouped_data.items():
            # Each ref is a (type, url) pair; the "Issue"/"PR" label is stored
            # in reference_id.
            references = [ReferenceV2(reference_id=ref_id, url=url) for ref_id, url in refs]

            yield AdvisoryData(
                advisory_id=vuln_id,
                aliases=[vuln_id],
                references_v2=references,
                url=self.repo_url,
            )
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import MagicMock

import pytest

from vulnerabilities.pipelines.v2_importers.github_issue_pr import GithubPipelineIssuePR
from vulnerabilities.tests import util_tests

# Directory holding the JSON fixtures, three levels above this test module.
TEST_DATA = Path(__file__).parents[2] / "test_data" / "github_issue_pr"


def _github_item(title, body, html_url):
    """Return a stand-in object with ``title``, ``body`` and ``html_url`` attributes."""
    return SimpleNamespace(title=title, body=body, html_url=html_url)


@pytest.fixture
def pipeline():
    """Return a pipeline with a fixed repository URL and a mocked ``log``."""
    instance = GithubPipelineIssuePR()
    instance.repo_url = "https://github.com/test/repo"
    instance.log = MagicMock()
    return instance


@pytest.mark.django_db
def test_collect_issues_and_prs(pipeline):
    """Identifiers found in titles and bodies are grouped with their source URL."""
    pipeline.issues = [
        _github_item(
            "Fix for CVE-2023-1234 found",
            "This resolves a security issue",
            "http://example.com/issue1",
        ),
        _github_item(
            "No vulnerability mentioned",
            "This is unrelated",
            "http://example.com/issue2",
        ),
    ]
    pipeline.pull_requestes = [
        _github_item(
            "Patch addressing GHSA-zzz-111",
            "Also fixes PYSEC-2024-5678",
            "http://example.com/pr1",
        ),
    ]

    expected = {
        "CVE-2023-1234": [("Issue", "http://example.com/issue1")],
        "GHSA-zzz-111": [("PR", "http://example.com/pr1")],
        "PYSEC-2024-5678": [("PR", "http://example.com/pr1")],
    }

    assert pipeline.collect_issues_and_prs() == expected


@pytest.mark.django_db
def test_collect_advisories_from_json(pipeline):
    """Advisories built from pre-grouped JSON data match the expected fixture."""
    grouped = json.loads((TEST_DATA / "issues_and_pr.json").read_text(encoding="utf-8"))
    expected_file = TEST_DATA / "expected_advisory_output.json"

    # Bypass the grouping step so only the advisory construction is exercised.
    pipeline.collect_issues_and_prs = MagicMock(return_value=grouped)

    result = [advisory.to_dict() for advisory in pipeline.collect_advisories()]

    util_tests.check_results_against_json(result, expected_file)
a/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json b/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json new file mode 100644 index 000000000..3ac486d96 --- /dev/null +++ b/vulnerabilities/tests/test_data/github_issue_pr/expected_advisory_output.json @@ -0,0 +1,64 @@ +[ + { + "advisory_id": "CVE-2023-1234", + "aliases": [ + "CVE-2023-1234" + ], + "summary": "", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "Issue", + "reference_type": "", + "url": "https://example.com/issue1" + }, + { + "reference_id": "PR", + "reference_type": "", + "url": "https://example.com/pr1" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + }, + { + "advisory_id": "GHSA-zzz-111", + "aliases": [ + "GHSA-zzz-111" + ], + "summary": "", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "PR", + "reference_type": "", + "url": "https://example.com/pr1" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + }, + { + "advisory_id": "PYSEC-2024-5678", + "aliases": [ + "PYSEC-2024-5678" + ], + "summary": "", + "affected_packages": [], + "references_v2": [ + { + "reference_id": "PR", + "reference_type": "", + "url": "https://example.com/pr1" + } + ], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + } +] \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json b/vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json new file mode 100644 index 000000000..2f68eab98 --- /dev/null +++ b/vulnerabilities/tests/test_data/github_issue_pr/issues_and_pr.json @@ -0,0 +1,24 @@ +{ + "CVE-2023-1234": [ + [ + "Issue", + "https://example.com/issue1" + ], + [ + "PR", + "https://example.com/pr1" + ] + ], + "GHSA-zzz-111": [ + [ + "PR", + "https://example.com/pr1" + ] + ], + 
"PYSEC-2024-5678": [ + [ + "PR", + "https://example.com/pr1" + ] + ] +} \ No newline at end of file