Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Logging improvements #388

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions pybaseball/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,29 @@
from .teamid_lookup import team_ids
from .statcast import statcast, statcast_single_game
from .statcast_pitcher import (
statcast_pitcher,
statcast_pitcher_exitvelo_barrels,
statcast_pitcher_expected_stats,
statcast_pitcher_pitch_arsenal,
statcast_pitcher_arsenal_stats,
statcast_pitcher_percentile_ranks,
statcast_pitcher_spin_dir_comp
statcast_pitcher,
statcast_pitcher_exitvelo_barrels,
statcast_pitcher_expected_stats,
statcast_pitcher_pitch_arsenal,
statcast_pitcher_arsenal_stats,
statcast_pitcher_percentile_ranks,
statcast_pitcher_spin_dir_comp
)
from .statcast_batter import (
statcast_batter,
statcast_batter_exitvelo_barrels,
statcast_batter_expected_stats,
statcast_batter_percentile_ranks,
statcast_batter_pitch_arsenal
statcast_batter,
statcast_batter_exitvelo_barrels,
statcast_batter_expected_stats,
statcast_batter_percentile_ranks,
statcast_batter_pitch_arsenal
)
from .statcast_running import statcast_sprint_speed, statcast_running_splits
from .statcast_fielding import (
statcast_outs_above_average,
statcast_outfield_directional_oaa,
statcast_outfield_catch_prob,
statcast_outfielder_jump,
statcast_catcher_poptime,
statcast_catcher_framing
statcast_outs_above_average,
statcast_outfield_directional_oaa,
statcast_outfield_catch_prob,
statcast_outfielder_jump,
statcast_catcher_poptime,
statcast_catcher_framing
)
from .league_batting_stats import batting_stats_bref
from .league_batting_stats import batting_stats_range
Expand Down Expand Up @@ -100,3 +100,6 @@
fg_team_pitching_data)
from .split_stats import get_splits
from .version import __version__
import logging

package_logger = logging.getLogger(__name__)
42 changes: 23 additions & 19 deletions pybaseball/league_pitching_stats.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import date
import logging
import io
from typing import Optional, Union

Expand All @@ -11,14 +12,16 @@
from .datasources.bref import BRefSession

session = BRefSession()
logger = logging.getLogger(__name__)


def get_soup(start_dt: Optional[Union[date, str]], end_dt: Optional[Union[date, str]]) -> BeautifulSoup:
# get most recent standings if date not specified
if((start_dt is None) or (end_dt is None)):
print('Error: a date range needs to be specified')
if ((start_dt is None) or (end_dt is None)):
logger.error('a date range needs to be specified')
return None
url = "http://www.baseball-reference.com/leagues/daily.cgi?user_team=&bust_cache=&type=p&lastndays=7&dates=fromandto&fromandto={}.{}&level=mlb&franch=&stat=&stat_value=0".format(start_dt, end_dt)
url = "http://www.baseball-reference.com/leagues/daily.cgi?user_team=&bust_cache=&type=p&lastndays=7&dates=fromandto&fromandto={}.{}&level=mlb&franch=&stat=&stat_value=0".format(
start_dt, end_dt)
s = session.get(url).content
# a workaround to avoid beautiful soup applying the wrong encoding
s = str(s).encode()
Expand Down Expand Up @@ -47,7 +50,7 @@ def get_table(soup: BeautifulSoup) -> pd.DataFrame:


@cache.df_cache()
def pitching_stats_range(start_dt: Optional[str]=None, end_dt: Optional[str]=None) -> pd.DataFrame:
def pitching_stats_range(start_dt: Optional[str] = None, end_dt: Optional[str] = None) -> pd.DataFrame:
"""
Get all pitching stats for a set time range. This can be the past week, the
month of August, anything. Just supply the start and end date in YYYY-MM-DD
Expand All @@ -62,36 +65,37 @@ def pitching_stats_range(start_dt: Optional[str]=None, end_dt: Optional[str]=Non
# retrieve html from baseball reference
soup = get_soup(start_dt_date, end_dt_date)
table = get_table(soup)
table = table.dropna(how='all') # drop if all columns are NA
#fix some strange formatting for percentage columns
table = table.dropna(how='all') # drop if all columns are NA
# fix some strange formatting for percentage columns
table = table.replace('---%', np.nan)
#make sure these are all numeric
# make sure these are all numeric
for column in ['Age', '#days', 'G', 'GS', 'W', 'L', 'SV', 'IP', 'H',
'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B',
'3B', 'IBB', 'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit',
'WHIP', 'BAbip', 'SO9', 'SO/W']:
'R', 'ER', 'BB', 'SO', 'HR', 'HBP', 'ERA', 'AB', '2B',
'3B', 'IBB', 'GDP', 'SF', 'SB', 'CS', 'PO', 'BF', 'Pit',
'WHIP', 'BAbip', 'SO9', 'SO/W']:
table[column] = pd.to_numeric(table[column])
#convert str(xx%) values to float(0.XX) decimal values
# convert str(xx%) values to float(0.XX) decimal values
for column in ['Str', 'StL', 'StS', 'GB/FB', 'LD', 'PU']:
table[column] = table[column].replace('%','',regex=True).astype('float')/100
table[column] = table[column].replace('%', '', regex=True).astype('float') / 100

table = table.drop('', axis=1)
return table

def pitching_stats_bref(season: Optional[int]=None) -> pd.DataFrame:

def pitching_stats_bref(season: Optional[int] = None) -> pd.DataFrame:
"""
Get all pitching stats for a set season. If no argument is supplied, gives stats for
current season to date.
"""
if season is None:
season = most_recent_season()
str_season = str(season)
start_dt = str_season + '-03-01' #opening day is always late march or early april
end_dt = str_season + '-11-30' #postseason is definitely over by end of November
return(pitching_stats_range(start_dt, end_dt))
start_dt = str_season + '-03-01' # opening day is always late march or early april
end_dt = str_season + '-11-30' # postseason is definitely over by end of November
return (pitching_stats_range(start_dt, end_dt))


def bwar_pitch(return_all: bool=False) -> pd.DataFrame:
def bwar_pitch(return_all: bool = False) -> pd.DataFrame:
"""
Get data from war_daily_pitch table. Returns WAR, its components, and a few other useful stats.
To get all fields from this table, supply argument return_all=True.
Expand All @@ -103,6 +107,6 @@ def bwar_pitch(return_all: bool=False) -> pd.DataFrame:
return c
else:
cols_to_keep = ['name_common', 'mlb_ID', 'player_ID', 'year_ID', 'team_ID', 'stint_ID', 'lg_ID',
'G', 'GS', 'RA','xRA', 'BIP', 'BIP_perc','salary', 'ERA_plus', 'WAR_rep', 'WAA',
'WAA_adj','WAR']
'G', 'GS', 'RA', 'xRA', 'BIP', 'BIP_perc', 'salary', 'ERA_plus', 'WAR_rep', 'WAA',
'WAA_adj', 'WAR']
return c[cols_to_keep]
6 changes: 4 additions & 2 deletions pybaseball/playerid_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import zipfile
import logging

from typing import List, Tuple, Iterable

Expand All @@ -15,6 +16,7 @@
PEOPLE_FILE_PATTERN = re.compile("/people.+csv$")

_client = None
logger = logging.getLogger(__name__)


def get_register_file():
Expand Down Expand Up @@ -47,7 +49,7 @@ def chadwick_register(save: bool = False) -> pd.DataFrame:
table = pd.read_csv(get_register_file())
return table

print('Gathering player lookup table. This may take a moment.')
logger.info('Gathering player lookup table. This may take a moment.')
s = requests.get(url).content
mlb_only_cols = ['key_retro', 'key_bbref', 'key_fangraphs', 'mlb_played_first', 'mlb_played_last']
cols_to_keep = ['name_last', 'name_first', 'key_mlbam'] + mlb_only_cols
Expand Down Expand Up @@ -126,7 +128,7 @@ def search(self, last: str, first: str = None, fuzzy: bool = False) -> pd.DataFr

# If no matches, return 5 closest names
if len(results) == 0 and fuzzy:
print("No identically matched names found! Returning the 5 most similar names.")
logger.warning("No identically matched names found! Returning the 5 most similar names.")
results=get_closest_names(last=last, first=first, player_table=self.table)

return results
Expand Down
31 changes: 19 additions & 12 deletions pybaseball/retrosheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,11 @@
discovers discrepancies and we appreciate learning of the details.
"""
import pandas as pd
import logging
from pybaseball.utils import get_text_file
from datetime import datetime
from io import StringIO
from github import Github
import os
from getpass import getuser, getpass
from github.GithubException import RateLimitExceededException
import warnings

Expand Down Expand Up @@ -106,6 +105,9 @@
roster_url = 'https://raw.githubusercontent.com/chadwickbureau/retrosheet/master/rosters/{}{}.ROS'
event_url = 'https://raw.githubusercontent.com/chadwickbureau/retrosheet/master/event/{}/{}'

logger = logging.getLogger(__name__)


def events(season, type='regular', export_dir='.'):
"""
Pulls retrosheet event files for an entire season. The `type` argument
Expand All @@ -115,7 +117,7 @@ def events(season, type='regular', export_dir='.'):
Right now, pybaseball does not parse the retrosheet files but downloads and
saves them.
"""
GH_TOKEN=os.getenv('GH_TOKEN', '')
GH_TOKEN = os.getenv('GH_TOKEN', '')
if not os.path.exists(export_dir):
os.mkdir(export_dir)

Expand All @@ -142,16 +144,17 @@ def events(season, type='regular', export_dir='.'):
)

for filename in event_files:
print(f'Downloading {filename}')
logger.info('Downloading %s', filename)
s = get_text_file(event_url.format(type, filename))
with open(os.path.join(export_dir, filename), 'w') as f:
f.write(s)


def rosters(season):
"""
Pulls retrosheet roster files for an entire season
"""
GH_TOKEN=os.getenv('GH_TOKEN', '')
GH_TOKEN = os.getenv('GH_TOKEN', '')

try:
g = Github(GH_TOKEN)
Expand All @@ -170,15 +173,16 @@ def rosters(season):
UserWarning
)

df_list = [_roster(team = r[:3], season = season, checked=False) for r in rosters]
df_list = [_roster(team=r[:3], season=season, checked=False) for r in rosters]

return pd.concat(df_list)

def _roster(team, season, checked = False):

def _roster(team, season, checked=False):
"""
Pulls retrosheet roster files
"""
GH_TOKEN=os.getenv('GH_TOKEN', '')
GH_TOKEN = os.getenv('GH_TOKEN', '')

if not checked:
g = Github(GH_TOKEN)
Expand All @@ -204,6 +208,7 @@ def _roster(team, season, checked = False):
data.columns = roster_columns
return data


def park_codes():
"""
Pulls retrosheet Park IDs
Expand All @@ -213,15 +218,16 @@ def park_codes():
data.columns = parkcode_columns
return data


def schedules(season):
"""
Pull retrosheet schedule for a given season
"""
GH_TOKEN=os.getenv('GH_TOKEN', '')
GH_TOKEN = os.getenv('GH_TOKEN', '')
# validate input
g = Github(GH_TOKEN)
repo = g.get_repo('chadwickbureau/retrosheet')
schedules = [f.path[f.path.rfind('/')+1:] for f in repo.get_contents('schedule')]
schedules = [f.path[f.path.rfind('/') + 1:] for f in repo.get_contents('schedule')]
file_name = f'{season}SKED.TXT'

if file_name not in schedules:
Expand All @@ -231,15 +237,16 @@ def schedules(season):
data.columns = schedule_columns
return data


def season_game_logs(season):
"""
Pull Retrosheet game logs for a given season
"""
GH_TOKEN=os.getenv('GH_TOKEN', '')
GH_TOKEN = os.getenv('GH_TOKEN', '')
# validate input
g = Github(GH_TOKEN)
repo = g.get_repo('chadwickbureau/retrosheet')
gamelogs = [f.path[f.path.rfind('/')+1:] for f in repo.get_contents('gamelog')]
gamelogs = [f.path[f.path.rfind('/') + 1:] for f in repo.get_contents('gamelog')]
file_name = f'GL{season}.TXT'

if file_name not in gamelogs:
Expand Down
15 changes: 10 additions & 5 deletions pybaseball/statcast.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import concurrent.futures
import warnings
import logging
from datetime import date
from typing import Optional, Union

Expand All @@ -11,14 +11,19 @@
from . import cache
from .utils import sanitize_date_range, statcast_date_range

logger = logging.getLogger(__name__)

_SC_SINGLE_GAME_REQUEST = "/statcast_search/csv?all=true&type=details&game_pk={game_pk}"
# pylint: disable=line-too-long
_SC_SMALL_REQUEST = "/statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={start_dt}&game_date_lt={end_dt}&team={team}&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&"


# _MAX_SC_RESULTS = 40000

class StatcastException(Exception):
pass


@cache.df_cache(expires=365)
def _small_request(start_dt: date, end_dt: date, team: Optional[str] = None) -> pd.DataFrame:
data = statcast_ds.get_statcast_data_from_csv_url(
Expand Down Expand Up @@ -47,7 +52,7 @@ def _small_request(start_dt: date, end_dt: date, team: Optional[str] = None) ->

def _check_warning(start_dt: date, end_dt: date) -> None:
if not cache.config.enabled and (end_dt - start_dt).days >= 42:
warnings.warn(_OVERSIZE_WARNING)
logger.warning(_OVERSIZE_WARNING)


def _handle_request(start_dt: date, end_dt: date, step: int, verbose: bool,
Expand All @@ -59,7 +64,7 @@ def _handle_request(start_dt: date, end_dt: date, step: int, verbose: bool,
_check_warning(start_dt, end_dt)

if verbose:
print("This is a large query, it may take a moment to complete", flush=True)
logger.info("This is a large query, it may take a moment to complete")

dataframe_list = []
date_range = list(statcast_date_range(start_dt, end_dt, step, verbose))
Expand All @@ -71,7 +76,7 @@ def _handle_request(start_dt: date, end_dt: date, step: int, verbose: bool,
# See https://docs.python.org/3.7/library/concurrent.futures.html#processpoolexecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {executor.submit(_small_request, subq_start, subq_end, team=team)
for subq_start, subq_end in date_range}
for subq_start, subq_end in date_range}
for future in concurrent.futures.as_completed(futures):
dataframe_list.append(future.result())
progress.update(1)
Expand Down Expand Up @@ -114,7 +119,7 @@ def statcast(start_dt: str = None, end_dt: str = None, team: str = None,
team=team, parallel=parallel)


def statcast_single_game(game_pk: Union[str, int]) -> pd.DataFrame:
def statcast_single_game(game_pk: Union[str, int]) -> Optional[pd.DataFrame]:
"""
Pulls statcast play-level data from Baseball Savant for a single game,
identified by its MLB game ID (game_pk in statcast data)
Expand Down
3 changes: 2 additions & 1 deletion pybaseball/team_batting.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Optional
import logging

import pandas as pd
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -35,7 +36,7 @@ def team_batting_bref(team: str, start_season: int, end_season: Optional[int]=No
raw_data = []
headings: Optional[List[str]] = None
for season in range(start_season, end_season+1):
print("Getting Batting Data: {} {}".format(season, team))
logging.info("Getting Batting Data: %s %s",season, team)
stats_url = "{}/{}.shtml".format(url, season)
response = session.get(stats_url)
soup = BeautifulSoup(response.content, 'html.parser')
Expand Down
Loading