Files
baseball-nn/data/get_data.py

402 lines
13 KiB
Python

from data.db_connect import Database
# Game date in YYYYmmDD
def pull_training_data(database: Database, game_date: str, game_number: int, park_id: str):
# Training data
"""
- park_id
- lf_fence_distance
- lf_fence_height
- ct_fence_distance
- ct_fence_height
- rf_fence_distance
- rf_fence_height
- has_roof
- elevation
- game_date
- game_number
- day_of_week
- day_night
- home_plate_ump_id
- b1_ump_id
- b2_ump_id
- b3_ump_id
- lf_ump_id
- rf_ump_id
- temperature
- humidity
- dew_point
- apparent_temperature
- air_pressure
- precipitation
- rain
- snowfall
- cloud_cover
- wind_speed
- wind_direction
- wind_gusts
- sun_rise
- sun_set
- moon_phase
- Team being trained
- game_num
- manager_id
- starting_pitcher_id
- starting_1_id
- starting_1_position
- starting_2_id
- starting_2_position
- starting_3_id
- starting_3_position
- starting_4_id
- starting_4_position
- starting_5_id
- starting_5_position
- starting_6_id
- starting_6_position
- starting_7_id
- starting_7_position
- starting_8_id
- starting_8_position
- starting_9_id
- starting_9_position
- Calculated values from previous games
- win_streak
- loss_streak
- game_date
- day_of_weeki
- length_in_outs
- day_night
- completion_info
- forfeit
- protest
- park_id
- attendence
- length_in_minutes
- hits
- doubles
- triples
- homeruns
- rbis
- sacrifice_hits
- sacrifice_flies
- hit_by_pitch
- walks
- intentional_walks
- strikeouts
- stolen_bases
- caught_stealing
- grounded_double
- interference
- left_on_base
- pitchers_used
- individual_earned_runs
- earned_runs
- wild_pitches
- balks
- putouts
- assists
- errors
- passed
- double_play
- triple_play
- game_num
- manager_id
- starting_pitcher_id
- starting_1_id
- starting_1_position
- starting_2_id
- starting_2_position
- starting_3_id
- starting_3_position
- starting_4_id
- starting_4_position
- starting_5_id
- starting_5_position
- starting_6_id
- starting_6_position
- starting_7_id
- starting_7_position
- starting_8_id
- starting_8_position
- starting_9_id
- starting_9_position
- Opposing team
- game_num
- manager_id
- starting_pitcher_id
- starting_1_id
- starting_1_position
- starting_2_id
- starting_2_position
- starting_3_id
- starting_3_position
- starting_4_id
- starting_4_position
- starting_5_id
- starting_5_position
- starting_6_id
- starting_6_position
- starting_7_id
- starting_7_position
- starting_8_id
- starting_8_position
- starting_9_id
- starting_9_position
- Calculated values from previous games
- win_streak
- loss_streak
- game_date
- day_of_weeki
- length_in_outs
- day_night
- completion_info
- forfeit
- protest
- park_id
- attendence
- length_in_minutes
- hits
- doubles
- triples
- homeruns
- rbis
- sacrifice_hits
- sacrifice_flies
- hit_by_pitch
- walks
- intentional_walks
- strikeouts
- stolen_bases
- caught_stealing
- grounded_double
- interference
- left_on_base
- pitchers_used
- individual_earned_runs
- earned_runs
- wild_pitches
- balks
- putouts
- assists
- errors
- passed
- double_play
- triple_play
- game_num
- manager_id
- starting_pitcher_id
- starting_1_id
- starting_1_position
- starting_2_id
- starting_2_position
- starting_3_id
- starting_3_position
- starting_4_id
- starting_4_position
- starting_5_id
- starting_5_position
- starting_6_id
- starting_6_position
- starting_7_id
- starting_7_position
- starting_8_id
- starting_8_position
- starting_9_id
- starting_9_position
- Output
- 1 home team wins, 0 visiting team wins
"""
select_upcoming_game = """
SELECT
games.id,
games.game_date, games.game_number,
games.day_of_week, games.day_night,
games.home_plate_ump_id,
games.b1_ump_id, games.b2_ump_id, games.b3_ump_id,
games.lf_ump_id, games.rf_ump_id,
parks.park_id,
parks.lf_fence_distance, parks.lf_fence_height,
parks.ct_fence_distance, parks.ct_fence_height,
parks.rf_fence_distance, parks.rf_fence_height,
parks.has_roof, parks.elevation,
parks.latitude, parks.longitude,
weather.temperature, weather.humidity, weather.dew_point,
weather.apparent_temperature, weather.air_pressure,
weather.precipitation, weather.rain, weather.snowfall,
weather.cloud_cover,
weather.wind_speed, weather.wind_direction, weather.wind_gusts,
weather.sun_rise, weather.sun_set, weather.moon_phase
FROM
games
LEFT JOIN parks ON parks.park_id = games.park_id
LEFT JOIN weather ON weather.game_id = games.id
WHERE
games.game_date = ? AND games.game_number = ? AND games.park_id = ?
"""
curr_game = database.select(select_upcoming_game, [game_date, game_number, park_id])
if curr_game is None:
print(f"Failed to get game data for date: {game_date}, number: {game_number}, park: {park_id}")
return None
select_teams = """
SELECT
win, team, home,
game_num, manager_id, starting_pitcher_id,
starting_1_id, starting_1_position,
starting_2_id, starting_2_position,
starting_3_id, starting_3_position,
starting_4_id, starting_4_position,
starting_5_id, starting_5_position,
starting_6_id, starting_6_position,
starting_7_id, starting_7_position,
starting_8_id, starting_8_position,
starting_9_id, starting_9_position
FROM
team_game
WHERE
game = ?
"""
curr_team_game = database.selectall(select_teams, [curr_game[0]])
if len(curr_team_game) != 2:
print(f"Got the wrong number of games {len(curr_team_game)}")
return None
training_result = (curr_team_game[0][0] == 1 and curr_team_game[0][2] == 1)
select_previous_games = """
SELECT
games.id,
games.game_date, games.game_number,
games.day_of_week, games.day_night,
games.length_in_outs, games.completion_info,
games.forfeit, games.protest,
games.attendence, games.length_in_minutes,
games.home_plate_ump_id,
games.b1_ump_id, games.b2_ump_id, games.b3_ump_id,
games.lf_ump_id, games.rf_ump_id,
team_game.game_num, team_game.score,
team_game.line_score, team_game.win,
team_game.hits, team_game.doubles, team_game.triples,
team_game.homeruns, team_game.rbis,
team_game.sacrifice_hits, team_game.sacrifice_flies,
team_game.hit_by_pitch, team_game.walks, team_game.intentional_walks,
team_game.strikeouts, team_game.stolen_bases,
team_game.caught_stealing, team_game.grounded_double,
team_game.interference, team_game.left_on_base,
team_game.pitchers_used,
team_game.individual_earned_runs, team_game.earned_runs,
team_game.wild_pitches, team_game.balks,
team_game.putouts, team_game.assists,
team_game.errors, team_game.passed,
team_game.double_play, team_game.triple_play,
team_game.manager_id, team_game.starting_pitcher_id,
team_game.starting_1_id, team_game.starting_1_position,
team_game.starting_2_id, team_game.starting_2_position,
team_game.starting_3_id, team_game.starting_3_position,
team_game.starting_4_id, team_game.starting_4_position,
team_game.starting_5_id, team_game.starting_5_position,
team_game.starting_6_id, team_game.starting_6_position,
team_game.starting_7_id, team_game.starting_7_position,
team_game.starting_8_id, team_game.starting_8_position,
team_game.starting_9_id, team_game.starting_9_position,
parks.park_id,
weather.temperature, weather.humidity, weather.dew_point,
weather.apparent_temperature, weather.air_pressure,
weather.precipitation, weather.rain, weather.snowfall,
weather.cloud_cover,
weather.wind_speed, weather.wind_direction, weather.wind_gusts,
weather.sun_rise, weather.sun_set, weather.moon_phase
FROM
games
LEFT JOIN parks ON parks.park_id = games.park_id
LEFT JOIN team_game ON team_game.game = games.id
LEFT JOIN weather ON weather.game_id = games.id
WHERE
games.game_date > ? AND
games.game_date < ? AND
team_game.team = ?
ORDER BY team_game.game_num ASC
"""
first_of_the_year = f"{game_date[:4]}0101"
select_previous_games_data_0 = [first_of_the_year, game_date, curr_team_game[0][1]]
select_previous_games_data_1 = [first_of_the_year, game_date, curr_team_game[1][1]]
training_data = [*curr_game[1:]]
training_data = [*training_data, *curr_team_game[0][1:]]
prev_games = database.selectall(select_previous_games, select_previous_games_data_0)
prev_game_data = [0] * 61
if prev_games is not None:
prev_win_streak = 0
index = len(prev_games) - 1
while index > 0 and prev_games[index][20] == 1:
prev_win_streak += 1
index -= 1
prev_loss_streak = 0
index = len(prev_games) - 1
while index > 0 and prev_games[index][20] == 0:
prev_loss_streak += 1
index -= 1
index = len(prev_games) - 1
prev_game_data = [
prev_win_streak,
prev_loss_streak,
*prev_games[index][1:]
]
training_data = [*training_data, *prev_game_data]
training_data = [*training_data, *curr_team_game[1][1:]]
prev_games = database.selectall(select_previous_games, select_previous_games_data_1)
prev_game_data = [0] * 61
if prev_games is not None:
prev_win_streak = 0
index = len(prev_games) - 1
while index > 0 and prev_games[index][20] == 1:
prev_win_streak += 1
index -= 1
prev_loss_streak = 0
index = len(prev_games) - 1
while index > 0 and prev_games[index][20] == 0:
prev_loss_streak += 1
index -= 1
index = len(prev_games) - 1
prev_game_data = [
prev_win_streak,
prev_loss_streak,
*prev_games[index][1:]
]
training_data = [*training_data, *prev_game_data]
return (training_result, training_data)