From 4fb4deff3680637d4e49da49ea996c42f646bb49 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Thu, 4 Jul 2024 08:22:53 -0500 Subject: [PATCH 1/6] Remove exception handler and improve logging Signed-off-by: Andrew Brain --- augur/tasks/github/events.py | 37 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index 44bb7e19ae..cfb809a1e0 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -19,34 +19,29 @@ def collect_events(repo_git: str): logger = logging.getLogger(collect_events.__name__) - - try: - - repo_obj = get_repo_by_repo_git(repo_git) - repo_id = repo_obj.repo_id - owner, repo = get_owner_repo(repo_git) + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id - logger.info(f"Collecting Github events for {owner}/{repo}") + owner, repo = get_owner_repo(repo_git) - key_auth = GithubRandomKeyAuth(logger) + logger.debug(f"Collecting Github events for {owner}/{repo}") - event_data = retrieve_all_event_data(repo_git, logger, key_auth) + key_auth = GithubRandomKeyAuth(logger) - if event_data: - process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger) - else: - logger.info(f"{owner}/{repo} has no events") + event_data = retrieve_all_event_data(repo_git, logger, key_auth) - except Exception as e: - logger.error(f"Could not collect events for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + if event_data: + process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger) + else: + logger.debug(f"{owner}/{repo} has no events") def retrieve_all_event_data(repo_git: str, logger, key_auth): owner, repo = get_owner_repo(repo_git) - logger.info(f"Collecting Github events for {owner}/{repo}") + logger.debug(f"Collecting Github events for {owner}/{repo}") url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" @@ -54,7 +49,7 @@ def retrieve_all_event_data(repo_git: str, logger, key_auth): event_count = github_data_access.get_resource_page_count(url) - logger.info(f"{owner}/{repo}: Collecting {event_count} github events") + logger.debug(f"{owner}/{repo}: Collecting {event_count} github events") return list(github_data_access.paginate_resource(url)) @@ -104,9 +99,7 @@ def process_events(events, task_name, repo_id, logger): # query = augur_db.session.query(PullRequest).filter(PullRequest.pr_url == pr_url) # related_pr = execute_session_query(query, 'one') except KeyError: - logger.info(f"{task_name}: Could not find related pr") - logger.info(f"{task_name}: We were searching for: {pr_url}") - logger.info(f"{task_name}: Skipping") + logger.warning(f"{task_name}: Could not find related pr. We were searching for: {pr_url}") continue pr_event_dicts.append( @@ -122,9 +115,7 @@ def process_events(events, task_name, repo_id, logger): # query = augur_db.session.query(Issue).filter(Issue.issue_url == issue_url) # related_issue = execute_session_query(query, 'one') except KeyError: - logger.info(f"{task_name}: Could not find related pr") - logger.info(f"{task_name}: We were searching for: {issue_url}") - logger.info(f"{task_name}: Skipping") + logger.warning(f"{task_name}: Could not find related issue. We were searching for: {issue_url}") continue issue_event_dicts.append( From dcdeed3e0b4232b82abd5e2bc2cf3250db29a607 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Thu, 4 Jul 2024 14:38:29 -0500 Subject: [PATCH 2/6] Fully collect github events Signed-off-by: Andrew Brain --- augur/tasks/github/events.py | 66 +++++++++++++++---- augur/tasks/github/util/github_data_access.py | 8 ++- 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index cfb809a1e0..4c12518c8a 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -1,6 +1,7 @@ import logging import traceback import sqlalchemy as s +from sqlalchemy.sql import text from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask @@ -9,8 +10,8 @@ from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id +from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor, CollectionStatus +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id, get_session, get_engine platform_id = 1 @@ -29,15 +30,34 @@ def collect_events(repo_git: str): key_auth = GithubRandomKeyAuth(logger) - event_data = retrieve_all_event_data(repo_git, logger, key_auth) - - if event_data: - process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger) + if bulk_events_collection_endpoint_contains_all_data(repo_id): + event_generator = bulk_collect_pr_and_issue_events(repo_git, logger, key_auth) else: - logger.debug(f"{owner}/{repo} has no events") + event_generator = collect_pr_and_issues_events_by_number(repo_id, repo_git, logger, key_auth, f"{owner}/{repo}: Event task") + + events = [] + for event in event_generator: + events.append(event) + + # making this a decent size since process_events retrieves all the issues and prs each time + if len(events) >= 500: + process_events(events, f"{owner}/{repo}: Event task", repo_id, logger) + events.clear() + + if events: + process_events(events, f"{owner}/{repo}: Event task", repo_id, logger) + + +def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo): + + url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" + + github_data_access = GithubDataAccess(key_auth, logger) + + return github_data_access.does_pagination_contain_all_data(url) -def retrieve_all_event_data(repo_git: str, logger, key_auth): +def bulk_collect_pr_and_issue_events(repo_git: str, logger, key_auth): owner, repo = get_owner_repo(repo_git) @@ -47,11 +67,35 @@ def retrieve_all_event_data(repo_git: str, logger, key_auth): github_data_access = GithubDataAccess(key_auth, logger) - event_count = github_data_access.get_resource_page_count(url) + return github_data_access.paginate_resource(url) - logger.debug(f"{owner}/{repo}: Collecting {event_count} github events") - return list(github_data_access.paginate_resource(url)) +def collect_pr_and_issues_events_by_number(repo_id, repo_git: str, logger, key_auth, task_name) -> None: + + owner, repo = get_owner_repo(repo_git) + + # define logger for task + logger.debug(f"Collecting github events for {owner}/{repo}") + + engine = get_engine() + + with engine.connect() as connection: + + query = text(f""" + (select pr_src_number as number from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc) + UNION + (select gh_issues_number as number from issues WHERE repo_id={repo_id} order by created_at desc); + """) + + result = connection.execute(query).fetchall() + numbers = [x[0] for x in result] + + github_data_access = GithubDataAccess(key_auth, logger) + for number in numbers: + + event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{number}/events" + + yield from github_data_access.paginate_resource(event_url) def process_events(events, task_name, repo_id, logger): diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 2f4c988014..8b4b7419f5 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -65,11 +65,15 @@ def paginate_resource(self, url): return - def is_pagination_limited_by_max_github_pages(self, url): + def does_pagination_contain_all_data(self, url): page_count = self.get_resource_page_count(url) - return page_count <= 299 + if page_count > 300: + raise Exception(f"Either github raised the paginator page limit for things like events and messages, or + is_pagination_limited_by_max_github_pages is being used on a resource that does not have a page limit. Url: {url}") + + return page_count == 300 def get_resource_page_count(self, url): From f1907f902db2238f90d901b742557a5e2b018c86 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Thu, 4 Jul 2024 14:40:51 -0500 Subject: [PATCH 3/6] Add warning so we can detect if the last page link is not in headers a lot Signed-off-by: Andrew Brain --- augur/tasks/github/util/github_data_access.py | 1 + 1 file changed, 1 insertion(+) diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 8b4b7419f5..d3b0e0cd87 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -80,6 +80,7 @@ def get_resource_page_count(self, url): response = self.make_request_with_retries(url, method="HEAD") if 'last' not in response.links.keys(): + self.logger.warning(f"Github response without links. Headers: {response.headers}.") return 1 try: From dce359d888ebba9783a21ad6deebfafcc5fcc60a Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Jul 2024 09:45:28 -0500 Subject: [PATCH 4/6] Fix indentation Signed-off-by: Andrew Brain --- augur/tasks/github/events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index 4c12518c8a..39689970be 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -31,7 +31,7 @@ def collect_events(repo_git: str): key_auth = GithubRandomKeyAuth(logger) if bulk_events_collection_endpoint_contains_all_data(repo_id): - event_generator = bulk_collect_pr_and_issue_events(repo_git, logger, key_auth) + event_generator = bulk_collect_pr_and_issue_events(repo_git, logger, key_auth) else: event_generator = collect_pr_and_issues_events_by_number(repo_id, repo_git, logger, key_auth, f"{owner}/{repo}: Event task") From 2d784a5c06f854e7f892d50ecf00b999295014c8 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 5 Aug 2024 17:56:42 -0500 Subject: [PATCH 5/6] Fix boolean logic Signed-off-by: Andrew Brain --- augur/tasks/github/events.py | 8 +++++++- augur/tasks/github/util/github_data_access.py | 10 ---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index 39689970be..19da66cd89 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -54,7 +54,13 @@ def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, r github_data_access = GithubDataAccess(key_auth, logger) - return github_data_access.does_pagination_contain_all_data(url) + page_count = github_data_access.get_resource_page_count(url) + + if page_count > 300: + raise Exception(f"Either github raised the paginator page limit for things like events and messages, or + is_pagination_limited_by_max_github_pages is being used on a resource that does not have a page limit. Url: {url}") + + return page_count != 300 def bulk_collect_pr_and_issue_events(repo_git: str, logger, key_auth): diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index d3b0e0cd87..850336f53c 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -65,16 +65,6 @@ def paginate_resource(self, url): return - def does_pagination_contain_all_data(self, url): - - page_count = self.get_resource_page_count(url) - - if page_count > 300: - raise Exception(f"Either github raised the paginator page limit for things like events and messages, or - is_pagination_limited_by_max_github_pages is being used on a resource that does not have a page limit. Url: {url}") - - return page_count == 300 - def get_resource_page_count(self, url): response = self.make_request_with_retries(url, method="HEAD") From 058602476328d275e255f15cf28e6fb30d980982 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 5 Aug 2024 18:00:01 -0500 Subject: [PATCH 6/6] Fix string Signed-off-by: Andrew Brain --- augur/tasks/github/events.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index 19da66cd89..db904daa39 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -57,8 +57,7 @@ def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, r page_count = github_data_access.get_resource_page_count(url) if page_count > 300: - raise Exception(f"Either github raised the paginator page limit for things like events and messages, or - is_pagination_limited_by_max_github_pages is being used on a resource that does not have a page limit. Url: {url}") + raise Exception(f"Either github raised the paginator page limit for things like events and messages, or is_pagination_limited_by_max_github_pages is being used on a resource that does not have a page limit. Url: {url}") return page_count != 300