From ac0003210c26426e809609c2a36188b524a93cd0 Mon Sep 17 00:00:00 2001
From: James Maslek <jmaslek11@gmail.com>
Date: Tue, 18 Jun 2024 11:12:13 -0400
Subject: [PATCH] Redirect the Lahman database to Lahman's Dropbox site

---
 pybaseball/__init__.py |   2 +-
 pybaseball/lahman.py   | 151 ++++++++++++++++++++++++++---------------
 setup.py               |   1 +
 3 files changed, 100 insertions(+), 54 deletions(-)

diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py
index cc223855..410bc416 100644
--- a/pybaseball/__init__.py
+++ b/pybaseball/__init__.py
@@ -79,7 +79,7 @@
 from .lahman import schools
 from .lahman import series_post
 from .lahman import teams_core
-from .lahman import teams_upstream
+# from .lahman import teams_upstream  # Not in the Lahman database
 from .lahman import teams_franchises
 from .lahman import teams_half
 from .lahman import download_lahman
diff --git a/pybaseball/lahman.py b/pybaseball/lahman.py
index 437096eb..c4d3422e 100644
--- a/pybaseball/lahman.py
+++ b/pybaseball/lahman.py
@@ -1,136 +1,181 @@
-from io import BytesIO
+from io import StringIO
 from os import path
 from typing import Optional
-from zipfile import ZipFile
-
+import py7zr
 import pandas as pd
 import requests
 
 from . import cache
 
-url = "https://github.com/chadwickbureau/baseballdatabank/archive/master.zip"
-base_string = "baseballdatabank-master"
+url = "https://www.dropbox.com/scl/fi/hy0sxw6gaai7ghemrshi8/lahman_1871-2023_csv.7z?rlkey=edw1u63zzxg48gvpcmr3qpnhz&e=1&dl=1"
+base_string = "lahman_1871-2023_csv"
 
 _handle = None
 
-def get_lahman_zip() -> Optional[ZipFile]:
-    # Retrieve the Lahman database zip file, returns None if file already exists in cwd.
-    # If we already have the zip file, keep re-using that.
-    # Making this a function since everything else will be re-using these lines
+
+def get_lahman_7z() -> Optional[str]:
+    """Return the downloaded .7z path, or None if the CSVs are already extracted."""
     global _handle
     if path.exists(path.join(cache.config.cache_directory, base_string)):
         _handle = None
     elif not _handle:
-        s = requests.get(url, stream=True)
-        _handle = ZipFile(BytesIO(s.content))
+        with requests.get(url, stream=True) as response:
+            response.raise_for_status()  # never save an HTTP error page as the archive
+            local_7z = path.join(cache.config.cache_directory, "lahman_1871-2023_csv.7z")
+            with open(local_7z, "wb") as f:
+                f.writelines(response.iter_content(chunk_size=8192))
+        _handle = local_7z
     return _handle
 
+
 def download_lahman():
     # download entire lahman db to present working directory
-    z = get_lahman_zip()
+    z = get_lahman_7z()
+    if z is not None:
+        with py7zr.SevenZipFile(z, mode="r") as archive:
+            archive.extractall(path=cache.config.cache_directory)
+
+
+def _get_file(
+    tablename: str, quotechar: str = "'", encoding: str = "latin1"
+) -> pd.DataFrame:
+    z = get_lahman_7z()
+    f = f"{base_string}/{tablename}"
     if z is not None:
-        z.extractall(cache.config.cache_directory)
-        z = get_lahman_zip()
-        # this way we'll now start using the extracted zip directory
-        # instead of the session ZipFile object
-
-def _get_file(tablename: str, quotechar: str = "'") -> pd.DataFrame:
-    z = get_lahman_zip()
-    f = f'{base_string}/{tablename}'
+        download_lahman()
     data = pd.read_csv(
-        f"{path.join(cache.config.cache_directory, f)}" if z is None else z.open(f),
+        f"{path.join(cache.config.cache_directory, f)}",
         header=0,
-        sep=',',
-        quotechar=quotechar
+        sep=",",
+        quotechar=quotechar,
+        encoding=encoding,
     )
     return data
 
 
 # do this for every table in the lahman db so they can exist as separate functions
 def parks() -> pd.DataFrame:
-    return _get_file('core/Parks.csv')
+    return _get_file("Parks.csv")
+
 
 def all_star_full() -> pd.DataFrame:
-    return _get_file("core/AllstarFull.csv")
+    return _get_file("AllstarFull.csv")
+
 
 def appearances() -> pd.DataFrame:
-    return _get_file("core/Appearances.csv")
+    return _get_file("Appearances.csv")
+
 
 def awards_managers() -> pd.DataFrame:
-    return _get_file("contrib/AwardsManagers.csv")
+    return _get_file("AwardsManagers.csv")
+
 
 def awards_players() -> pd.DataFrame:
-    return _get_file("contrib/AwardsPlayers.csv")
+    return _get_file("AwardsPlayers.csv")
+
 
 def awards_share_managers() -> pd.DataFrame:
-    return _get_file("contrib/AwardsShareManagers.csv")
+    return _get_file("AwardsShareManagers.csv")
+
 
 def awards_share_players() -> pd.DataFrame:
-    return _get_file("contrib/AwardsSharePlayers.csv")
+    return _get_file("AwardsSharePlayers.csv")
+
 
 def batting() -> pd.DataFrame:
-    return _get_file("core/Batting.csv")
+    return _get_file("Batting.csv")
+
 
 def batting_post() -> pd.DataFrame:
-    return _get_file("core/BattingPost.csv")
+    return _get_file("BattingPost.csv")
+
 
 def college_playing() -> pd.DataFrame:
-    return _get_file("contrib/CollegePlaying.csv")
+    return _get_file("CollegePlaying.csv")
+
 
 def fielding() -> pd.DataFrame:
-    return _get_file("core/Fielding.csv")
+    return _get_file("Fielding.csv")
+
 
 def fielding_of() -> pd.DataFrame:
-    return _get_file("core/FieldingOF.csv")
+    return _get_file("FieldingOF.csv")
+
 
 def fielding_of_split() -> pd.DataFrame:
-    return _get_file("core/FieldingOFsplit.csv")
+    return _get_file("FieldingOFsplit.csv")
+
 
 def fielding_post() -> pd.DataFrame:
-    return _get_file("core/FieldingPost.csv")
+    return _get_file("FieldingPost.csv")
+
 
 def hall_of_fame() -> pd.DataFrame:
-    return _get_file("contrib/HallOfFame.csv")
+    return _get_file("HallOfFame.csv")
+
 
 def home_games() -> pd.DataFrame:
-    return _get_file("core/HomeGames.csv")
+    return _get_file("HomeGames.csv")
+
 
 def managers() -> pd.DataFrame:
-    return _get_file("core/Managers.csv")
+    return _get_file("Managers.csv")
+
 
 def managers_half() -> pd.DataFrame:
-    return _get_file("core/ManagersHalf.csv")
+    return _get_file("ManagersHalf.csv")
+
 
 def master() -> pd.DataFrame:
     # Alias for people -- the new name for master
     return people()
 
+
 def people() -> pd.DataFrame:
-    return _get_file("core/People.csv")
+    return _get_file("People.csv")
+
 
 def pitching() -> pd.DataFrame:
-    return _get_file("core/Pitching.csv")
+    return _get_file("Pitching.csv")
+
 
 def pitching_post() -> pd.DataFrame:
-    return _get_file("core/PitchingPost.csv")
+    return _get_file("PitchingPost.csv")
+
 
 def salaries() -> pd.DataFrame:
-    return _get_file("contrib/Salaries.csv")
+    return _get_file("Salaries.csv")
 
-def schools() -> pd.DataFrame:
-    return _get_file("contrib/Schools.csv", quotechar='"')  # different here bc of doublequotes used in some school names
 
 def series_post() -> pd.DataFrame:
-    return _get_file("core/SeriesPost.csv")
+    return _get_file("SeriesPost.csv")
+
 
 def teams_core() -> pd.DataFrame:
-    return _get_file("core/Teams.csv")
+    return _get_file("Teams.csv")
+
+
+# def teams_upstream() -> pd.DataFrame:
+#     return _get_file("upstream/Teams.csv") # manually maintained file
 
-def teams_upstream() -> pd.DataFrame:
-    return _get_file("upstream/Teams.csv") # manually maintained file
 
 def teams_franchises() -> pd.DataFrame:
-    return _get_file("core/TeamsFranchises.csv")
+    return _get_file("TeamsFranchises.csv")
+
 
 def teams_half() -> pd.DataFrame:
-    return _get_file("core/TeamsHalf.csv")
+    return _get_file("TeamsHalf.csv")
+
+
+def schools() -> pd.DataFrame:
+    """Return Schools.csv as a DataFrame, cleaned up so pandas can parse it."""
+    csv_path = path.join(cache.config.cache_directory, base_string, "Schools.csv")
+    if not path.exists(csv_path):
+        download_lahman()
+    # Decode explicitly as latin1 (the encoding _get_file uses): read_csv's
+    # `encoding` does not apply to an already-decoded StringIO buffer.
+    with open(csv_path, "r", encoding="latin1") as csv_file:
+        csv_data = csv_file.read().replace(", ", " ")
+    return pd.read_csv(
+        StringIO(csv_data), header=0, sep=",", quotechar='"', encoding="latin1"
+    )
diff --git a/setup.py b/setup.py
index 2d76831a..4689db31 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,7 @@
                       'matplotlib>=2.0.0',
                       'tqdm>=4.50.0',
                       'attrs>=20.3.0',
+                      'py7zr>=0.21.0'
                       ],
 
     # List additional groups of dependencies here (e.g. development