From ac0003210c26426e809609c2a36188b524a93cd0 Mon Sep 17 00:00:00 2001 From: James Maslek Date: Tue, 18 Jun 2024 11:12:13 -0400 Subject: [PATCH] Redirect the lahman database to lahman's dropbox site --- pybaseball/__init__.py | 2 +- pybaseball/lahman.py | 151 ++++++++++++++++++++++++++--------------- setup.py | 1 + 3 files changed, 100 insertions(+), 54 deletions(-) diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py index cc223855..410bc416 100644 --- a/pybaseball/__init__.py +++ b/pybaseball/__init__.py @@ -79,7 +79,7 @@ from .lahman import schools from .lahman import series_post from .lahman import teams_core -from .lahman import teams_upstream +# from .lahman import teams_upstream # Not in the Lahman database from .lahman import teams_franchises from .lahman import teams_half from .lahman import download_lahman diff --git a/pybaseball/lahman.py b/pybaseball/lahman.py index 437096eb..c4d3422e 100644 --- a/pybaseball/lahman.py +++ b/pybaseball/lahman.py @@ -1,136 +1,181 @@ -from io import BytesIO +from io import StringIO from os import path from typing import Optional -from zipfile import ZipFile - +import py7zr import pandas as pd import requests from . import cache -url = "https://github.com/chadwickbureau/baseballdatabank/archive/master.zip" -base_string = "baseballdatabank-master" +url = "https://www.dropbox.com/scl/fi/hy0sxw6gaai7ghemrshi8/lahman_1871-2023_csv.7z?rlkey=edw1u63zzxg48gvpcmr3qpnhz&e=1&dl=1" +base_string = "lahman_1871-2023_csv" _handle = None -def get_lahman_zip() -> Optional[ZipFile]: - # Retrieve the Lahman database zip file, returns None if file already exists in cwd. - # If we already have the zip file, keep re-using that. - # Making this a function since everything else will be re-using these lines + +def get_lahman_7z() -> Optional[str]: + global _handle if path.exists(path.join(cache.config.cache_directory, base_string)): _handle = None elif not _handle: - s = requests.get(url, stream=True) - _handle = ZipFile(BytesIO(s.content)) + response = requests.get(url, stream=True) + local_7z = path.join(cache.config.cache_directory, "lahman_1871-2023_csv.7z") + with open(local_7z, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + _handle = local_7z return _handle + def download_lahman(): # download entire lahman db to present working directory - z = get_lahman_zip() + z = get_lahman_7z() + if z is not None: + with py7zr.SevenZipFile(z, mode="r") as archive: + archive.extractall(path=cache.config.cache_directory) + + +def _get_file( + tablename: str, quotechar: str = "'", encoding: str = "latin1" +) -> pd.DataFrame: + z = get_lahman_7z() + f = f"{base_string}/{tablename}" if z is not None: - z.extractall(cache.config.cache_directory) - z = get_lahman_zip() - # this way we'll now start using the extracted zip directory - # instead of the session ZipFile object - -def _get_file(tablename: str, quotechar: str = "'") -> pd.DataFrame: - z = get_lahman_zip() - f = f'{base_string}/{tablename}' + download_lahman() data = pd.read_csv( - f"{path.join(cache.config.cache_directory, f)}" if z is None else z.open(f), + f"{path.join(cache.config.cache_directory, f)}", header=0, - sep=',', - quotechar=quotechar + sep=",", + quotechar=quotechar, + encoding=encoding, ) return data # do this for every table in the lahman db so they can exist as separate functions def parks() -> pd.DataFrame: - return _get_file('core/Parks.csv') + return _get_file("Parks.csv") + def all_star_full() -> pd.DataFrame: - return _get_file("core/AllstarFull.csv") + return _get_file("AllstarFull.csv") + def appearances() -> pd.DataFrame: - return _get_file("core/Appearances.csv") + return _get_file("Appearances.csv") + def awards_managers() -> pd.DataFrame: - return _get_file("contrib/AwardsManagers.csv") + return _get_file("AwardsManagers.csv") + def awards_players() -> pd.DataFrame: - return _get_file("contrib/AwardsPlayers.csv") + return _get_file("AwardsPlayers.csv") + def awards_share_managers() -> pd.DataFrame: - return _get_file("contrib/AwardsShareManagers.csv") + return _get_file("AwardsShareManagers.csv") + def awards_share_players() -> pd.DataFrame: - return _get_file("contrib/AwardsSharePlayers.csv") + return _get_file("AwardsSharePlayers.csv") + def batting() -> pd.DataFrame: - return _get_file("core/Batting.csv") + return _get_file("Batting.csv") + def batting_post() -> pd.DataFrame: - return _get_file("core/BattingPost.csv") + return _get_file("BattingPost.csv") + def college_playing() -> pd.DataFrame: - return _get_file("contrib/CollegePlaying.csv") + return _get_file("CollegePlaying.csv") + def fielding() -> pd.DataFrame: - return _get_file("core/Fielding.csv") + return _get_file("Fielding.csv") + def fielding_of() -> pd.DataFrame: - return _get_file("core/FieldingOF.csv") + return _get_file("FieldingOF.csv") + def fielding_of_split() -> pd.DataFrame: - return _get_file("core/FieldingOFsplit.csv") + return _get_file("FieldingOFsplit.csv") + def fielding_post() -> pd.DataFrame: - return _get_file("core/FieldingPost.csv") + return _get_file("FieldingPost.csv") + def hall_of_fame() -> pd.DataFrame: - return _get_file("contrib/HallOfFame.csv") + return _get_file("HallOfFame.csv") + def home_games() -> pd.DataFrame: - return _get_file("core/HomeGames.csv") + return _get_file("HomeGames.csv") + def managers() -> pd.DataFrame: - return _get_file("core/Managers.csv") + return _get_file("Managers.csv") + def managers_half() -> pd.DataFrame: - return _get_file("core/ManagersHalf.csv") + return _get_file("ManagersHalf.csv") + def master() -> pd.DataFrame: # Alias for people -- the new name for master return people() + def people() -> pd.DataFrame: - return _get_file("core/People.csv") + return _get_file("People.csv") + def pitching() -> pd.DataFrame: - return _get_file("core/Pitching.csv") + return _get_file("Pitching.csv") + def pitching_post() -> pd.DataFrame: - return _get_file("core/PitchingPost.csv") + return _get_file("PitchingPost.csv") + def salaries() -> pd.DataFrame: - return _get_file("contrib/Salaries.csv") + return _get_file("Salaries.csv") -def schools() -> pd.DataFrame: - return _get_file("contrib/Schools.csv", quotechar='"') # different here bc of doublequotes used in some school names def series_post() -> pd.DataFrame: - return _get_file("core/SeriesPost.csv") + return _get_file("SeriesPost.csv") + def teams_core() -> pd.DataFrame: - return _get_file("core/Teams.csv") + return _get_file("Teams.csv") + + +# def teams_upstream() -> pd.DataFrame: +# return _get_file("upstream/Teams.csv") # manually maintained file -def teams_upstream() -> pd.DataFrame: - return _get_file("upstream/Teams.csv") # manually maintained file def teams_franchises() -> pd.DataFrame: - return _get_file("core/TeamsFranchises.csv") + return _get_file("TeamsFranchises.csv") + def teams_half() -> pd.DataFrame: - return _get_file("core/TeamsHalf.csv") + return _get_file("TeamsHalf.csv") + + +def schools() -> pd.DataFrame: + """Clean up the schools to allows pandas reading""" + f = f"{base_string}/Schools.csv" + file = f"{path.join(cache.config.cache_directory, f)}" + if not path.exists(file): + download_lahman() + with open(file, "r") as f: + csv_data = f.read() + csv_data = csv_data.replace(", ", " ") + return pd.read_csv( + StringIO(csv_data), header=0, sep=",", quotechar='"', encoding="latin1" + ) diff --git a/setup.py b/setup.py index 2d76831a..4689db31 100644 --- a/setup.py +++ b/setup.py @@ -92,6 +92,7 @@ 'matplotlib>=2.0.0', 'tqdm>=4.50.0', 'attrs>=20.3.0', + 'py7zr>=0.21.0' ], # List additional groups of dependencies here (e.g. development