Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the Lahman Database Scraping #434

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pybaseball/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@
from .lahman import schools
from .lahman import series_post
from .lahman import teams_core
from .lahman import teams_upstream
# from .lahman import teams_upstream # Not in the Lahman database
from .lahman import teams_franchises
from .lahman import teams_half
from .lahman import download_lahman
Expand Down
151 changes: 98 additions & 53 deletions pybaseball/lahman.py
Original file line number Diff line number Diff line change
@@ -1,136 +1,181 @@
from io import BytesIO
from io import StringIO
from os import path
from typing import Optional
from zipfile import ZipFile

import py7zr
import pandas as pd
import requests

from . import cache

url = "https://github.com/chadwickbureau/baseballdatabank/archive/master.zip"
base_string = "baseballdatabank-master"
url = "https://www.dropbox.com/scl/fi/hy0sxw6gaai7ghemrshi8/lahman_1871-2023_csv.7z?rlkey=edw1u63zzxg48gvpcmr3qpnhz&e=1&dl=1"
base_string = "lahman_1871-2023_csv"

_handle = None

def get_lahman_7z() -> Optional[str]:
    """Return the local path of the Lahman 7z archive, downloading it if needed.

    Returns:
        None when the extracted ``base_string`` directory already exists in
        the cache directory (callers then read the CSV files directly).
        Otherwise the path of the archive inside the cache directory; the
        download happens at most once per session and the path is re-used
        on later calls.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    global _handle
    if path.exists(path.join(cache.config.cache_directory, base_string)):
        # Extracted data is already on disk, so no archive is needed.
        _handle = None
    elif not _handle:
        local_7z = path.join(cache.config.cache_directory, f"{base_string}.7z")
        with requests.get(url, stream=True) as response:
            # Fail loudly instead of saving an error page as the archive.
            response.raise_for_status()
            with open(local_7z, "wb") as archive_file:
                for chunk in response.iter_content(chunk_size=8192):
                    archive_file.write(chunk)
        # Only remember the path once the file has been written completely.
        _handle = local_7z
    return _handle


def download_lahman() -> None:
    """Download the Lahman archive if needed and extract it into the cache directory.

    Does nothing when ``get_lahman_7z()`` returns None, which it does once
    the extracted directory is already present in the cache directory.
    """
    archive_path = get_lahman_7z()
    if archive_path is not None:
        with py7zr.SevenZipFile(archive_path, mode="r") as archive:
            archive.extractall(path=cache.config.cache_directory)


def _get_file(
    tablename: str, quotechar: str = "'", encoding: str = "latin1"
) -> pd.DataFrame:
    """Read one Lahman CSV table from the extracted directory in the cache.

    Args:
        tablename: CSV file name relative to the extracted ``base_string``
            directory, e.g. ``"Parks.csv"``.
        quotechar: Quote character passed to ``pandas.read_csv``.
        encoding: Text encoding passed to ``pandas.read_csv``.

    Returns:
        The table as a DataFrame, with the first row used as the header.
    """
    # A non-None archive path means the extracted directory is missing,
    # so extract the archive before trying to read from it.
    if get_lahman_7z() is not None:
        download_lahman()
    csv_path = path.join(cache.config.cache_directory, base_string, tablename)
    return pd.read_csv(
        csv_path,
        header=0,
        sep=",",
        quotechar=quotechar,
        encoding=encoding,
    )


# do this for every table in the lahman db so they can exist as separate functions
def parks() -> pd.DataFrame:
    """Return the Lahman ``Parks.csv`` table as a DataFrame."""
    return _get_file("Parks.csv")


def all_star_full() -> pd.DataFrame:
    """Return the Lahman ``AllstarFull.csv`` table as a DataFrame."""
    return _get_file("AllstarFull.csv")


def appearances() -> pd.DataFrame:
    """Return the Lahman ``Appearances.csv`` table as a DataFrame."""
    return _get_file("Appearances.csv")


def awards_managers() -> pd.DataFrame:
    """Return the Lahman ``AwardsManagers.csv`` table as a DataFrame."""
    return _get_file("AwardsManagers.csv")


def awards_players() -> pd.DataFrame:
    """Return the Lahman ``AwardsPlayers.csv`` table as a DataFrame."""
    return _get_file("AwardsPlayers.csv")


def awards_share_managers() -> pd.DataFrame:
    """Return the Lahman ``AwardsShareManagers.csv`` table as a DataFrame."""
    return _get_file("AwardsShareManagers.csv")


def awards_share_players() -> pd.DataFrame:
    """Return the Lahman ``AwardsSharePlayers.csv`` table as a DataFrame."""
    return _get_file("AwardsSharePlayers.csv")


def batting() -> pd.DataFrame:
    """Return the Lahman ``Batting.csv`` table as a DataFrame."""
    return _get_file("Batting.csv")


def batting_post() -> pd.DataFrame:
    """Return the Lahman ``BattingPost.csv`` table as a DataFrame."""
    return _get_file("BattingPost.csv")


def college_playing() -> pd.DataFrame:
    """Return the Lahman ``CollegePlaying.csv`` table as a DataFrame."""
    return _get_file("CollegePlaying.csv")


def fielding() -> pd.DataFrame:
    """Return the Lahman ``Fielding.csv`` table as a DataFrame."""
    return _get_file("Fielding.csv")


def fielding_of() -> pd.DataFrame:
    """Return the Lahman ``FieldingOF.csv`` table as a DataFrame."""
    return _get_file("FieldingOF.csv")


def fielding_of_split() -> pd.DataFrame:
    """Return the Lahman ``FieldingOFsplit.csv`` table as a DataFrame."""
    return _get_file("FieldingOFsplit.csv")


def fielding_post() -> pd.DataFrame:
    """Return the Lahman ``FieldingPost.csv`` table as a DataFrame."""
    return _get_file("FieldingPost.csv")


def hall_of_fame() -> pd.DataFrame:
    """Return the Lahman ``HallOfFame.csv`` table as a DataFrame."""
    return _get_file("HallOfFame.csv")


def home_games() -> pd.DataFrame:
    """Return the Lahman ``HomeGames.csv`` table as a DataFrame."""
    return _get_file("HomeGames.csv")


def managers() -> pd.DataFrame:
    """Return the Lahman ``Managers.csv`` table as a DataFrame."""
    return _get_file("Managers.csv")


def managers_half() -> pd.DataFrame:
    """Return the Lahman ``ManagersHalf.csv`` table as a DataFrame."""
    return _get_file("ManagersHalf.csv")


def master() -> pd.DataFrame:
    """Alias for ``people()``; "people" is the new name for "master"."""
    people_table = people()
    return people_table


def people() -> pd.DataFrame:
    """Return the Lahman ``People.csv`` table as a DataFrame."""
    return _get_file("People.csv")


def pitching() -> pd.DataFrame:
    """Return the Lahman ``Pitching.csv`` table as a DataFrame."""
    return _get_file("Pitching.csv")


def pitching_post() -> pd.DataFrame:
    """Return the Lahman ``PitchingPost.csv`` table as a DataFrame."""
    return _get_file("PitchingPost.csv")


def salaries() -> pd.DataFrame:
    """Return the Lahman ``Salaries.csv`` table as a DataFrame."""
    return _get_file("Salaries.csv")

def schools() -> pd.DataFrame:
    """Return ``contrib/Schools.csv`` as a DataFrame, using '"' as the quote character.

    NOTE(review): a second ``schools`` definition appears later in this
    file and would replace this one at import time -- confirm which of the
    two is meant to survive.
    """
    # different here bc of doublequotes used in some school names
    return _get_file(tablename="contrib/Schools.csv", quotechar='"')

def series_post() -> pd.DataFrame:
    """Return the Lahman ``SeriesPost.csv`` table as a DataFrame."""
    return _get_file("SeriesPost.csv")


def teams_core() -> pd.DataFrame:
    """Return the Lahman ``Teams.csv`` table as a DataFrame."""
    return _get_file("Teams.csv")


# def teams_upstream() -> pd.DataFrame:
# return _get_file("upstream/Teams.csv") # manually maintained file

def teams_upstream() -> pd.DataFrame:
    """Return the ``upstream/Teams.csv`` table as a DataFrame.

    NOTE(review): the package ``__init__`` carries a comment saying this
    table is "Not in the Lahman database" -- confirm whether the file still
    exists in the extracted data before relying on this accessor.
    """
    table_path = "upstream/Teams.csv"  # manually maintained file
    return _get_file(table_path)

def teams_franchises() -> pd.DataFrame:
    """Return the Lahman ``TeamsFranchises.csv`` table as a DataFrame."""
    return _get_file("TeamsFranchises.csv")


def teams_half() -> pd.DataFrame:
    """Return the Lahman ``TeamsHalf.csv`` table as a DataFrame."""
    return _get_file("TeamsHalf.csv")


def schools() -> pd.DataFrame:
    """Load ``Schools.csv``, pre-cleaning the raw text so pandas can parse it.

    Every ", " (comma followed by a space) in the file is replaced with a
    single space before parsing, so that such commas are not treated as
    field separators. The archive is downloaded and extracted first if the
    file is not yet present in the cache directory.

    Returns:
        The schools table as a DataFrame, with the first row as the header.
    """
    csv_path = path.join(cache.config.cache_directory, f"{base_string}/Schools.csv")
    if not path.exists(csv_path):
        download_lahman()
    # Decode explicitly as latin1 instead of relying on the platform's
    # default encoding, which varies between systems.
    with open(csv_path, "r", encoding="latin1") as csv_file:
        csv_data = csv_file.read()
    csv_data = csv_data.replace(", ", " ")
    return pd.read_csv(
        StringIO(csv_data), header=0, sep=",", quotechar='"', encoding="latin1"
    )
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@
'matplotlib>=2.0.0',
'tqdm>=4.50.0',
'attrs>=20.3.0',
'py7zr>=0.21.0'
],

# List additional groups of dependencies here (e.g. development
Expand Down