jldbc · pinsondg · Oct 4, 2023
diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py
@@ -7,29 +7,29 @@
 from .teamid_lookup import team_ids
 from .statcast import statcast, statcast_single_game
 from .statcast_pitcher import (
- statcast_pitcher,
- statcast_pitcher_exitvelo_barrels,
- statcast_pitcher_expected_stats,
- statcast_pitcher_pitch_arsenal,
- statcast_pitcher_arsenal_stats,
- statcast_pitcher_percentile_ranks,
- statcast_pitcher_spin_dir_comp
+ statcast_pitcher,
+ statcast_pitcher_exitvelo_barrels,
+ statcast_pitcher_expected_stats,
+ statcast_pitcher_pitch_arsenal,
+ statcast_pitcher_arsenal_stats,
+ statcast_pitcher_percentile_ranks,
+ statcast_pitcher_spin_dir_comp
 )
 from .statcast_batter import (
- statcast_batter,
- statcast_batter_exitvelo_barrels,
- statcast_batter_expected_stats,
- statcast_batter_percentile_ranks,
- statcast_batter_pitch_arsenal
+ statcast_batter,
+ statcast_batter_exitvelo_barrels,
+ statcast_batter_expected_stats,
+ statcast_batter_percentile_ranks,
+ statcast_batter_pitch_arsenal
 )
 from .statcast_running import statcast_sprint_speed, statcast_running_splits
 from .statcast_fielding import (
- statcast_outs_above_average,
- statcast_outfield_directional_oaa,
- statcast_outfield_catch_prob,
- statcast_outfielder_jump,
- statcast_catcher_poptime,
- statcast_catcher_framing
+ statcast_outs_above_average,
+ statcast_outfield_directional_oaa,
+ statcast_outfield_catch_prob,
+ statcast_outfielder_jump,
+ statcast_catcher_poptime,
+ statcast_catcher_framing
 )
 from .league_batting_stats import batting_stats_bref
 from .league_batting_stats import batting_stats_range
@@ -100,3 +100,6 @@
  fg_team_pitching_data)
 from .split_stats import get_splits
 from .version import __version__
+import logging
+
+package_logger = logging.getLogger(__name__)
diff --git a/pybaseball/league_pitching_stats.py b/pybaseball/league_pitching_stats.py
@@ -1,4 +1,5 @@
 from datetime import date
+import logging
 import io
 from typing import Optional, Union
 
@@ -11,14 +12,16 @@
 from .datasources.bref import BRefSession
 
 session = BRefSession()
+logger = logging.getLogger(__name__)
 
 
 def get_soup(start_dt: Optional[Union[date, str]], end_dt: Optional[Union[date, str]]) -> BeautifulSoup:
  # get most recent standings if date not specified
- if((start_dt is None) or (end_dt is None)):
- print('Error: a date range needs to be specified')
+ if start_dt is None or end_dt is None:
+ logger.error('a date range needs to be specified')
  return None
- url = "http://www.baseball-reference.com/leagues/daily.cgi?user_team=&bust_cache=&type=p&lastndays=7&dates=fromandto&fromandto={}.{}&level=mlb&franch=&stat=&stat_value=0".format(start_dt, end_dt)
+ url = "http://www.baseball-reference.com/leagues/daily.cgi?user_team=&bust_cache=&type=p&lastndays=7&dates=fromandto&fromandto={}.{}&level=mlb&franch=&stat=&stat_value=0".format(
+ start_dt, end_dt)
  s = session.get(url).content
  # a workaround to avoid beautiful soup applying the wrong encoding
  s = str(s).encode()
@@ -47,7 +50,7 @@ def get_table(soup: BeautifulSoup) -> pd.DataFrame:
 
 
 @cache.df_cache()
-def pitching_stats_range(start_dt: Optional[str]=None, end_dt: Optional[str]=None) -> pd.DataFrame:
+def pitching_stats_range(start_dt: Optional[str] = None, end_dt: Optional[str] = None) -> pd.DataFrame:
  """
  Get all pitching stats for a set time range. This can be the past week, the
  month of August, anything. Just supply the start and end date in YYYY-MM-DD
@@ -62,36 +65,37 @@ def pitching_stats_range(start_dt: Optional[str]=None, end_dt: Optional[str]=Non
  # retrieve html from baseball reference
  soup = get_soup(start_dt_date, end_dt_date)
  table = get_table(soup)
- table = table.dropna(how='all') # drop if all columns are NA
- #fix some strange formatting for percentage columns
+ table = table.dropna(how='all')  # drop if all columns are NA
+ # fix some strange formatting for percentage columns
  table = table.replace('---%', np.nan)
- #make sure these are all numeric
+ # make sure these are all numeric
  for column in ['Age', '#days', 'G', 'GS', 'W', 'L', 'SV', 'IP', 'H',
-  'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B',
-  '3B', 'IBB', 'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit',
-  'WHIP', 'BAbip', 'SO9', 'SO/W']:
+ 'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B',
+ '3B', 'IBB', 'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit',
+ 'WHIP', 'BAbip', 'SO9', 'SO/W']:
  table[column] = pd.to_numeric(table[column])
- #convert str(xx%) values to float(0.XX) decimal values
+ # convert str(xx%) values to float(0.XX) decimal values
  for column in ['Str', 'StL', 'StS', 'GB/FB', 'LD', 'PU']:
- table[column] = table[column].replace('%','',regex=True).astype('float')/100
+ table[column] = table[column].replace('%', '', regex=True).astype('float') / 100
 
  table = table.drop('', axis=1)
  return table
 
-def pitching_stats_bref(season: Optional[int]=None) -> pd.DataFrame:
+
+def pitching_stats_bref(season: Optional[int] = None) -> pd.DataFrame:
  """
  Get all pitching stats for a set season. If no argument is supplied, gives stats for
  current season to date.
  """
  if season is None:
  season = most_recent_season()
  str_season = str(season)
- start_dt = str_season + '-03-01' #opening day is always late march or early april
- end_dt = str_season + '-11-30' #postseason is definitely over by end of November
- return(pitching_stats_range(start_dt, end_dt))
+ start_dt = str_season + '-03-01'  # opening day is always late march or early april
+ end_dt = str_season + '-11-30'  # postseason is definitely over by end of November
+ return pitching_stats_range(start_dt, end_dt)
 
 
-def bwar_pitch(return_all: bool=False) -> pd.DataFrame:
+def bwar_pitch(return_all: bool = False) -> pd.DataFrame:
  """
  Get data from war_daily_pitch table. Returns WAR, its components, and a few other useful stats.
  To get all fields from this table, supply argument return_all=True.
@@ -103,6 +107,6 @@ def bwar_pitch(return_all: bool=False) -> pd.DataFrame:
  return c
  else:
  cols_to_keep = ['name_common', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID', 'stint_ID', 'lg_ID',
- 'G', 'GS', 'RA','xRA', 'BIP', 'BIP_perc','salary', 'ERA_plus', 'WAR_rep', 'WAA',
- 'WAA_adj','WAR']
+ 'G', 'GS', 'RA', 'xRA', 'BIP', 'BIP_perc', 'salary', 'ERA_plus', 'WAR_rep', 'WAA',
+ 'WAA_adj', 'WAR']
  return c[cols_to_keep]
diff --git a/pybaseball/playerid_lookup.py b/pybaseball/playerid_lookup.py
@@ -3,6 +3,7 @@
 import os
 import re
 import zipfile
+import logging
 
 from typing import List, Tuple, Iterable
 
@@ -15,6 +16,7 @@
 PEOPLE_FILE_PATTERN = re.compile("/people.+csv$")
 
 _client = None
+logger = logging.getLogger(__name__)
 
 
 def get_register_file():
@@ -47,7 +49,7 @@ def chadwick_register(save: bool = False) -> pd.DataFrame:
  table = pd.read_csv(get_register_file())
  return table
 
- print('Gathering player lookup table. This may take a moment.')
+ logger.info('Gathering player lookup table. This may take a moment.')
  s = requests.get(url).content
  mlb_only_cols = ['key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']
  cols_to_keep = ['name_last', 'name_first', 'key_mlbam'] + mlb_only_cols
@@ -126,7 +128,7 @@ def search(self, last: str, first: str = None, fuzzy: bool = False) -> pd.DataFr
 
  # If no matches, return 5 closest names
  if len(results) == 0 and fuzzy:
- print("No identically matched names found! Returning the 5 most similar names.")
+ logger.warning("No identically matched names found! Returning the 5 most similar names.")
  results=get_closest_names(last=last, first=first, player_table=self.table)
 
  return results

diff --git a/pybaseball/retrosheet.py b/pybaseball/retrosheet.py
@@ -21,12 +21,11 @@
 discovers discrepancies and we appreciate learning of the details.
 """
 import pandas as pd
+import logging
 from pybaseball.utils import get_text_file
-from datetime import datetime
 from io import StringIO
 from github import Github
 import os
-from getpass import getuser, getpass
 from github.GithubException import RateLimitExceededException
 import warnings
 
@@ -106,6 +105,9 @@
 roster_url = 'https://raw.githubusercontent.com/chadwickbureau/retrosheet/master/rosters/{}{}.ROS'
 event_url = 'https://raw.githubusercontent.com/chadwickbureau/retrosheet/master/event/{}/{}'
 
+logger = logging.getLogger(__name__)
+
+
 def events(season, type='regular', export_dir='.'):
  """
  Pulls retrosheet event files for an entire season. The `type` argument
@@ -115,7 +117,7 @@ def events(season, type='regular', export_dir='.'):
  Right now, pybaseball does not parse the retrosheet files but downloads and
  saves them.
  """
- GH_TOKEN=os.getenv('GH_TOKEN', '')
+ GH_TOKEN = os.getenv('GH_TOKEN', '')
  if not os.path.exists(export_dir):
  os.mkdir(export_dir)
 
@@ -142,16 +144,17 @@ def events(season, type='regular', export_dir='.'):
  )
 
  for filename in event_files:
- print(f'Downloading {filename}')
+ logger.info('Downloading %s', filename)
  s = get_text_file(event_url.format(type, filename))
  with open(os.path.join(export_dir, filename), 'w') as f:
  f.write(s)
 
+
 def rosters(season):
  """
  Pulls retrosheet roster files for an entire season
  """
- GH_TOKEN=os.getenv('GH_TOKEN', '')
+ GH_TOKEN = os.getenv('GH_TOKEN', '')
 
  try:
  g = Github(GH_TOKEN)
@@ -170,15 +173,16 @@ def rosters(season):
  UserWarning
  )
 
- df_list = [_roster(team = r[:3], season = season, checked=False) for r in rosters]
+ df_list = [_roster(team=r[:3], season=season, checked=False) for r in rosters]
 
  return pd.concat(df_list)
 
-def _roster(team, season, checked = False):
+
+def _roster(team, season, checked=False):
  """
  Pulls retrosheet roster files
  """
- GH_TOKEN=os.getenv('GH_TOKEN', '')
+ GH_TOKEN = os.getenv('GH_TOKEN', '')
 
  if not checked:
  g = Github(GH_TOKEN)
@@ -204,6 +208,7 @@ def _roster(team, season, checked = False):
  data.columns = roster_columns
  return data
 
+
 def park_codes():
  """
  Pulls retrosheet Park IDs
@@ -213,15 +218,16 @@ def park_codes():
  data.columns = parkcode_columns
  return data
 
+
 def schedules(season):
  """
  Pull retrosheet schedule for a given season
  """
- GH_TOKEN=os.getenv('GH_TOKEN', '')
+ GH_TOKEN = os.getenv('GH_TOKEN', '')
  # validate input
  g = Github(GH_TOKEN)
  repo = g.get_repo('chadwickbureau/retrosheet')
- schedules = [f.path[f.path.rfind('/')+1:] for f in repo.get_contents('schedule')]
+ schedules = [f.path[f.path.rfind('/') + 1:] for f in repo.get_contents('schedule')]
  file_name = f'{season}SKED.TXT'
 
  if file_name not in schedules:
@@ -231,15 +237,16 @@ def schedules(season):
  data.columns = schedule_columns
  return data
 
+
 def season_game_logs(season):
  """
  Pull Retrosheet game logs for a given season
  """
- GH_TOKEN=os.getenv('GH_TOKEN', '')
+ GH_TOKEN = os.getenv('GH_TOKEN', '')
  # validate input
  g = Github(GH_TOKEN)
  repo = g.get_repo('chadwickbureau/retrosheet')
- gamelogs = [f.path[f.path.rfind('/')+1:] for f in repo.get_contents('gamelog')]
+ gamelogs = [f.path[f.path.rfind('/') + 1:] for f in repo.get_contents('gamelog')]
  file_name = f'GL{season}.TXT'
 
  if file_name not in gamelogs:

diff --git a/pybaseball/statcast.py b/pybaseball/statcast.py
@@ -1,5 +1,5 @@
 import concurrent.futures
-import warnings
+import logging
 from datetime import date
 from typing import Optional, Union
 
@@ -11,14 +11,19 @@
 from . import cache
 from .utils import sanitize_date_range, statcast_date_range
 
+logger = logging.getLogger(__name__)
+
 _SC_SINGLE_GAME_REQUEST = "/statcast_search/csv?all=true&type=details&game_pk={game_pk}"
 # pylint: disable=line-too-long
 _SC_SMALL_REQUEST = "/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={start_dt}&game_date_lt={end_dt}&team={team}&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&"
+
+
 # _MAX_SC_RESULTS = 40000
 
 class StatcastException(Exception):
  pass
 
+
 @cache.df_cache(expires=365)
 def _small_request(start_dt: date, end_dt: date, team: Optional[str] = None) -> pd.DataFrame:
  data = statcast_ds.get_statcast_data_from_csv_url(
@@ -47,7 +52,7 @@ def _small_request(start_dt: date, end_dt: date, team: Optional[str] = None) ->
 
 def _check_warning(start_dt: date, end_dt: date) -> None:
  if not cache.config.enabled and (end_dt - start_dt).days >= 42:
- warnings.warn(_OVERSIZE_WARNING)
+ logger.warning(_OVERSIZE_WARNING)
 
 
 def _handle_request(start_dt: date, end_dt: date, step: int, verbose: bool,
@@ -59,7 +64,7 @@ def _handle_request(start_dt: date, end_dt: date, step: int, verbose: bool,
  _check_warning(start_dt, end_dt)
 
  if verbose:
- print("This is a large query, it may take a moment to complete", flush=True)
+ logger.info("This is a large query, it may take a moment to complete")
 
  dataframe_list = []
  date_range = list(statcast_date_range(start_dt, end_dt, step, verbose))
@@ -71,7 +76,7 @@ def _handle_request(start_dt: date, end_dt: date, step: int, verbose: bool,
  # See https://docs.python.org/3.7/library/concurrent.futures.html#processpoolexecutor
  with concurrent.futures.ThreadPoolExecutor() as executor:
  futures = {executor.submit(_small_request, subq_start, subq_end, team=team)
- for subq_start, subq_end in date_range}
+  for subq_start, subq_end in date_range}
  for future in concurrent.futures.as_completed(futures):
  dataframe_list.append(future.result())
  progress.update(1)
@@ -114,7 +119,7 @@ def statcast(start_dt: str = None, end_dt: str = None, team: str = None,
  team=team, parallel=parallel)
 
 
-def statcast_single_game(game_pk: Union[str, int]) -> pd.DataFrame:
+def statcast_single_game(game_pk: Union[str, int]) -> Optional[pd.DataFrame]:
  """
  Pulls statcast play-level data from Baseball Savant for a single game,
  identified by its MLB game ID (game_pk in statcast data)

diff --git a/pybaseball/team_batting.py b/pybaseball/team_batting.py
@@ -1,4 +1,5 @@
 from typing import List, Optional
+import logging
 
 import pandas as pd
 from bs4 import BeautifulSoup
@@ -35,7 +36,7 @@ def team_batting_bref(team: str, start_season: int, end_season: Optional[int]=No
  raw_data = []
  headings: Optional[List[str]] = None
  for season in range(start_season, end_season+1):
- print("Getting Batting Data: {} {}".format(season, team))
+ logging.getLogger(__name__).info("Getting Batting Data: %s %s", season, team)  # module logger, not root
  stats_url = "{}/{}.shtml".format(url, season)
  response = session.get(stats_url)
  soup = BeautifulSoup(response.content, 'html.parser')