jldbc · jmaslek · Jun 18, 2024
diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py
@@ -79,7 +79,7 @@
 from .lahman import schools
 from .lahman import series_post
 from .lahman import teams_core
-from .lahman import teams_upstream
+# from .lahman import teams_upstream # Not in the Lahman database
 from .lahman import teams_franchises
 from .lahman import teams_half
 from .lahman import download_lahman

diff --git a/pybaseball/lahman.py b/pybaseball/lahman.py
@@ -1,136 +1,181 @@
-from io import BytesIO
+from io import StringIO
 from os import path
 from typing import Optional
-from zipfile import ZipFile
-
+import py7zr
 import pandas as pd
 import requests
 
 from . import cache
 
-url = "https://github.com/chadwickbureau/baseballdatabank/archive/master.zip"
-base_string = "baseballdatabank-master"
+url = "https://www.dropbox.com/scl/fi/hy0sxw6gaai7ghemrshi8/lahman_1871-2023_csv.7z?rlkey=edw1u63zzxg48gvpcmr3qpnhz&e=1&dl=1"
+base_string = "lahman_1871-2023_csv"
 
 _handle = None
 
-def get_lahman_zip() -> Optional[ZipFile]:
-    # Retrieve the Lahman database zip file, returns None if file already exists in cwd.
-    # If we already have the zip file, keep re-using that.
-    # Making this a function since everything else will be re-using these lines
+
+def get_lahman_7z() -> Optional[str]:
+    """Return the path of the downloaded Lahman .7z archive, or None when the extracted data directory already exists in the cache."""
     global _handle
     if path.exists(path.join(cache.config.cache_directory, base_string)):
         _handle = None
     elif not _handle:
-        s = requests.get(url, stream=True)
-        _handle = ZipFile(BytesIO(s.content))
+        response = requests.get(url, stream=True)
+        response.raise_for_status()  # never save an HTTP error page as the archive
+        local_7z = path.join(cache.config.cache_directory, "lahman_1871-2023_csv.7z")
+        with open(local_7z, "wb") as f:
+            f.writelines(response.iter_content(chunk_size=8192))
+        _handle = local_7z
     return _handle
 
+
 def download_lahman():
-    # download entire lahman db to present working directory
+    # download the entire lahman db and extract it into the cache directory
- z = get_lahman_zip()
+ z = get_lahman_7z()
+ if z is not None:
+ with py7zr.SevenZipFile(z, mode="r") as archive:
+ archive.extractall(path=cache.config.cache_directory)
+
+
+def _get_file(
+ tablename: str, quotechar: str = "'", encoding: str = "latin1"
+) -> pd.DataFrame:
+ z = get_lahman_7z()
+ f = f"{base_string}/{tablename}"
  if z is not None:
- z.extractall(cache.config.cache_directory)
- z = get_lahman_zip()
- # this way we'll now start using the extracted zip directory
- # instead of the session ZipFile object
-
-def _get_file(tablename: str, quotechar: str = "'") -> pd.DataFrame:
- z = get_lahman_zip()
- f = f'{base_string}/{tablename}'
+ download_lahman()
  data = pd.read_csv(
- f"{path.join(cache.config.cache_directory, f)}" if z is None else z.open(f),
+ f"{path.join(cache.config.cache_directory, f)}",
  header=0,
- sep=',',
- quotechar=quotechar
+ sep=",",
+ quotechar=quotechar,
+ encoding=encoding,
  )
  return data
 
 
 # do this for every table in the lahman db so they can exist as separate functions
 def parks() -> pd.DataFrame:
- return _get_file('core/Parks.csv')
+ return _get_file("Parks.csv")
+
 
 def all_star_full() -> pd.DataFrame:
- return _get_file("core/AllstarFull.csv")
+ return _get_file("AllstarFull.csv")
+
 
 def appearances() -> pd.DataFrame:
- return _get_file("core/Appearances.csv")
+ return _get_file("Appearances.csv")
+
 
 def awards_managers() -> pd.DataFrame:
- return _get_file("contrib/AwardsManagers.csv")
+ return _get_file("AwardsManagers.csv")
+
 
 def awards_players() -> pd.DataFrame:
- return _get_file("contrib/AwardsPlayers.csv")
+ return _get_file("AwardsPlayers.csv")
+
 
 def awards_share_managers() -> pd.DataFrame:
- return _get_file("contrib/AwardsShareManagers.csv")
+ return _get_file("AwardsShareManagers.csv")
+
 
 def awards_share_players() -> pd.DataFrame:
- return _get_file("contrib/AwardsSharePlayers.csv")
+ return _get_file("AwardsSharePlayers.csv")
+
 
 def batting() -> pd.DataFrame:
- return _get_file("core/Batting.csv")
+ return _get_file("Batting.csv")
+
 
 def batting_post() -> pd.DataFrame:
- return _get_file("core/BattingPost.csv")
+ return _get_file("BattingPost.csv")
+
 
 def college_playing() -> pd.DataFrame:
- return _get_file("contrib/CollegePlaying.csv")
+ return _get_file("CollegePlaying.csv")
+
 
 def fielding() -> pd.DataFrame:
- return _get_file("core/Fielding.csv")
+ return _get_file("Fielding.csv")
+
 
 def fielding_of() -> pd.DataFrame:
- return _get_file("core/FieldingOF.csv")
+ return _get_file("FieldingOF.csv")
+
 
 def fielding_of_split() -> pd.DataFrame:
- return _get_file("core/FieldingOFsplit.csv")
+ return _get_file("FieldingOFsplit.csv")
+
 
 def fielding_post() -> pd.DataFrame:
- return _get_file("core/FieldingPost.csv")
+ return _get_file("FieldingPost.csv")
+
 
 def hall_of_fame() -> pd.DataFrame:
- return _get_file("contrib/HallOfFame.csv")
+ return _get_file("HallOfFame.csv")
+
 
 def home_games() -> pd.DataFrame:
- return _get_file("core/HomeGames.csv")
+ return _get_file("HomeGames.csv")
+
 
 def managers() -> pd.DataFrame:
- return _get_file("core/Managers.csv")
+ return _get_file("Managers.csv")
+
 
 def managers_half() -> pd.DataFrame:
- return _get_file("core/ManagersHalf.csv")
+ return _get_file("ManagersHalf.csv")
+
 
 def master() -> pd.DataFrame:
  # Alias for people -- the new name for master
  return people()
 
+
 def people() -> pd.DataFrame:
- return _get_file("core/People.csv")
+ return _get_file("People.csv")
+
 
 def pitching() -> pd.DataFrame:
- return _get_file("core/Pitching.csv")
+ return _get_file("Pitching.csv")
+
 
 def pitching_post() -> pd.DataFrame:
- return _get_file("core/PitchingPost.csv")
+ return _get_file("PitchingPost.csv")
+
 
 def salaries() -> pd.DataFrame:
- return _get_file("contrib/Salaries.csv")
+ return _get_file("Salaries.csv")
 
-def schools() -> pd.DataFrame:
- return _get_file("contrib/Schools.csv", quotechar='"') # different here bc of doublequotes used in some school names
 
 def series_post() -> pd.DataFrame:
- return _get_file("core/SeriesPost.csv")
+ return _get_file("SeriesPost.csv")
+
 
 def teams_core() -> pd.DataFrame:
- return _get_file("core/Teams.csv")
+ return _get_file("Teams.csv")
+
+
+# def teams_upstream() -> pd.DataFrame:
+# return _get_file("upstream/Teams.csv") # manually maintained file
 
-def teams_upstream() -> pd.DataFrame:
- return _get_file("upstream/Teams.csv") # manually maintained file
 
 def teams_franchises() -> pd.DataFrame:
- return _get_file("core/TeamsFranchises.csv")
+ return _get_file("TeamsFranchises.csv")
+
 
 def teams_half() -> pd.DataFrame:
- return _get_file("core/TeamsHalf.csv")
+ return _get_file("TeamsHalf.csv")
+
+
+def schools() -> pd.DataFrame:
+    """Load Schools.csv, replacing ", " with " " in the raw text first so pandas can parse it."""
+    relative_path = f"{base_string}/Schools.csv"
+    file = path.join(cache.config.cache_directory, relative_path)
+    if not path.exists(file):
+        download_lahman()
+    # Decode on open: read_csv's `encoding` has no effect on an already-decoded text buffer.
+    with open(file, "r", encoding="latin1") as handle:
+        csv_data = handle.read().replace(", ", " ")
+    return pd.read_csv(
+        StringIO(csv_data), header=0, sep=",", quotechar='"', encoding="latin1"
+    )
diff --git a/setup.py b/setup.py
@@ -92,6 +92,7 @@
  'matplotlib>=2.0.0',
  'tqdm>=4.50.0',
  'attrs>=20.3.0',
+ 'py7zr>=0.21.0'
  ],
 
  # List additional groups of dependencies here (e.g. development