Full collect events #2880

Merged · 6 commits · Aug 5, 2024
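This PR reworks GitHub event collection in `augur/tasks/github/events.py`. `collect_events` now probes the bulk `/issues/events` endpoint first: if its page count is below GitHub's 300-page pagination cap, the endpoint contains the repository's full event history and is paginated directly; otherwise events are collected per issue and pull request number. Events stream through a generator and are processed in batches of 500, and the now-unused `is_pagination_limited_by_max_github_pages` helper is dropped from `GithubDataAccess`.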
96 changes: 68 additions & 28 deletions augur/tasks/github/events.py
@@ -1,6 +1,7 @@
 import logging
 import traceback
 import sqlalchemy as s
+from sqlalchemy.sql import text

 from augur.tasks.init.celery_app import celery_app as celery
 from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
@@ -9,8 +10,8 @@
 from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
 from augur.tasks.github.util.util import get_owner_repo
 from augur.tasks.util.worker_util import remove_duplicate_dicts
-from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor
-from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id
+from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor, CollectionStatus
+from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id, get_session, get_engine


 platform_id = 1
@@ -19,44 +20,87 @@
 def collect_events(repo_git: str):

     logger = logging.getLogger(collect_events.__name__)

-    try:
-
-        repo_obj = get_repo_by_repo_git(repo_git)
-        repo_id = repo_obj.repo_id
+    repo_obj = get_repo_by_repo_git(repo_git)
+    repo_id = repo_obj.repo_id

-        owner, repo = get_owner_repo(repo_git)
+    owner, repo = get_owner_repo(repo_git)

-        logger.info(f"Collecting Github events for {owner}/{repo}")
+    logger.debug(f"Collecting Github events for {owner}/{repo}")

-        key_auth = GithubRandomKeyAuth(logger)
+    key_auth = GithubRandomKeyAuth(logger)

-        event_data = retrieve_all_event_data(repo_git, logger, key_auth)
-
-        if event_data:
-            process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger)
-        else:
-            logger.info(f"{owner}/{repo} has no events")
-
-    except Exception as e:
-        logger.error(f"Could not collect events for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}")
+    if bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):
+        event_generator = bulk_collect_pr_and_issue_events(repo_git, logger, key_auth)
+    else:
+        event_generator = collect_pr_and_issues_events_by_number(repo_id, repo_git, logger, key_auth, f"{owner}/{repo}: Event task")
+
+    events = []
+    for event in event_generator:
+        events.append(event)
+
+        # keep the batch fairly large, since process_events retrieves all of the repo's issues and PRs on each call
+        if len(events) >= 500:
+            process_events(events, f"{owner}/{repo}: Event task", repo_id, logger)
+            events.clear()
+
+    if events:
+        process_events(events, f"{owner}/{repo}: Event task", repo_id, logger)
+
+
+def bulk_events_collection_endpoint_contains_all_data(key_auth, logger, owner, repo):
+
+    url = f"https://api.github.com/repos/{owner}/{repo}/issues/events"
+
+    github_data_access = GithubDataAccess(key_auth, logger)
+
+    page_count = github_data_access.get_resource_page_count(url)
+
+    if page_count > 300:
+        raise Exception(f"Either github raised the paginator page limit for things like events and messages, or this check is being used on a resource that does not have a page limit. Url: {url}")
+
+    return page_count != 300

-def retrieve_all_event_data(repo_git: str, logger, key_auth):
+
+def bulk_collect_pr_and_issue_events(repo_git: str, logger, key_auth):

     owner, repo = get_owner_repo(repo_git)

-    logger.info(f"Collecting Github events for {owner}/{repo}")
+    logger.debug(f"Collecting Github events for {owner}/{repo}")

     url = f"https://api.github.com/repos/{owner}/{repo}/issues/events"

     github_data_access = GithubDataAccess(key_auth, logger)

-    event_count = github_data_access.get_resource_page_count(url)
-
-    logger.info(f"{owner}/{repo}: Collecting {event_count} github events")
-
-    return list(github_data_access.paginate_resource(url))
+    return github_data_access.paginate_resource(url)
+
+
+def collect_pr_and_issues_events_by_number(repo_id, repo_git: str, logger, key_auth, task_name):
+
+    owner, repo = get_owner_repo(repo_git)
+
+    logger.debug(f"Collecting github events for {owner}/{repo}")
+
+    engine = get_engine()
+
+    with engine.connect() as connection:
+
+        query = text(f"""
+            (select pr_src_number as number from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc)
+            UNION
+            (select gh_issues_number as number from issues WHERE repo_id={repo_id} order by created_at desc);
+        """)
+
+        result = connection.execute(query).fetchall()
+        numbers = [x[0] for x in result]
+
+    github_data_access = GithubDataAccess(key_auth, logger)
+
+    for number in numbers:
+
+        event_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{number}/events"
+
+        yield from github_data_access.paginate_resource(event_url)

 def process_events(events, task_name, repo_id, logger):
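One note on the numbers query above: it interpolates `repo_id` into the SQL with an f-string. Since `repo_id` comes out of Augur's own database the value is effectively trusted, but `text()` also accepts bound parameters, which sidesteps string interpolation entirely. A minimal sketch of that variant, assuming the engine returned by `get_engine`; the helper name `fetch_issue_and_pr_numbers` is illustrative, not part of this PR:

```python
from sqlalchemy import text

def fetch_issue_and_pr_numbers(engine, repo_id):
    # Same UNION as the PR's query, but with a bound :repo_id parameter
    # so the driver handles quoting instead of Python string formatting.
    query = text("""
        (SELECT pr_src_number AS number FROM pull_requests WHERE repo_id = :repo_id)
        UNION
        (SELECT gh_issues_number AS number FROM issues WHERE repo_id = :repo_id)
    """)

    with engine.connect() as connection:
        result = connection.execute(query, {"repo_id": repo_id}).fetchall()

    return [row[0] for row in result]
```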
@@ -104,9 +148,7 @@ def process_events(events, task_name, repo_id, logger):
             # query = augur_db.session.query(PullRequest).filter(PullRequest.pr_url == pr_url)
             # related_pr = execute_session_query(query, 'one')
         except KeyError:
-            logger.info(f"{task_name}: Could not find related pr")
-            logger.info(f"{task_name}: We were searching for: {pr_url}")
-            logger.info(f"{task_name}: Skipping")
+            logger.warning(f"{task_name}: Could not find related pr. We were searching for: {pr_url}")
             continue

         pr_event_dicts.append(
@@ -122,9 +164,7 @@ def process_events(events, task_name, repo_id, logger):
             # query = augur_db.session.query(Issue).filter(Issue.issue_url == issue_url)
             # related_issue = execute_session_query(query, 'one')
         except KeyError:
-            logger.info(f"{task_name}: Could not find related pr")
-            logger.info(f"{task_name}: We were searching for: {issue_url}")
-            logger.info(f"{task_name}: Skipping")
+            logger.warning(f"{task_name}: Could not find related issue. We were searching for: {issue_url}")
             continue

         issue_event_dicts.append(
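The new `collect_events` drains whichever generator it picked in batches of 500, since each `process_events` call re-reads the repo's issues and pull requests. For reference, its manual append/clear loop is equivalent to this small batching helper (illustrative only, not code from this PR; requires Python 3.8+ for the walrus operator):

```python
from itertools import islice

def in_batches(events, size=500):
    # Yield lists of up to `size` events from any iterable, so the
    # expensive process_events call runs once per batch rather than
    # once per event.
    iterator = iter(events)
    while batch := list(islice(iterator, size)):
        yield batch

# for batch in in_batches(event_generator):
#     process_events(batch, task_name, repo_id, logger)
```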
7 changes: 1 addition & 6 deletions augur/tasks/github/util/github_data_access.py
@@ -65,17 +65,12 @@ def paginate_resource(self, url):

         return

-    def is_pagination_limited_by_max_github_pages(self, url):
-
-        page_count = self.get_resource_page_count(url)
-
-        return page_count <= 299
-
     def get_resource_page_count(self, url):

         response = self.make_request_with_retries(url, method="HEAD")

         if 'last' not in response.links.keys():
+            self.logger.warning(f"Github response without links. Headers: {response.headers}.")
             return 1

         try:
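For context on `get_resource_page_count` and the `!= 300` check in events.py: GitHub reports pagination through the `Link` response header, so a HEAD request is enough to learn the page count without downloading any bodies, and the PR treats a count of exactly 300 as GitHub having truncated the listing at its page cap. A rough standalone sketch with `requests` (the function and token handling are illustrative; `GithubDataAccess` wraps the same idea behind `make_request_with_retries`):

```python
from urllib.parse import parse_qs, urlparse

import requests

def resource_page_count(url: str, token: str) -> int:
    # HEAD is sufficient: pagination metadata lives in the Link header.
    response = requests.head(url, headers={"Authorization": f"token {token}"})
    response.raise_for_status()

    # A single page of results carries no rel="last" link at all.
    if "last" not in response.links:
        return 1

    # requests parses the Link header into response.links; the page count
    # is the "page" query parameter of the rel="last" URL.
    last_url = response.links["last"]["url"]
    return int(parse_qs(urlparse(last_url).query)["page"][0])
```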