Files
baseball-nn/data/stats_importer.py

278 lines
14 KiB
Python

import os
import csv
import time
import shutil
from math import *
from data.db_connect import Database
from data.build_weather import get_weather, get_sun_and_moon_phase
class Importer:
    """Load per-game baseball statistics from CSV files into the database.

    Every CSV row describes one game.  For each row a ``games`` record and two
    ``team_game`` records (visiting and home) are inserted through
    ``self.database``; a ``weather`` record is added when historic weather
    for the game could be fetched.
    """

    # (row interval, pause in seconds, message).  Longest pause first: only
    # the first matching entry is applied, so a row number that is a multiple
    # of several intervals waits once instead of stacking the pauses.
    _THROTTLE_STEPS = (
        (10000, 60 * 60 * 24, "Sleeping for 1 day"),
        (5000, 60 * 60, "Sleeping for 1 hour"),
        (600, 60, "Sleeping for 1 min"),
    )

    # CSV columns feeding the ``games`` insert, in column order of _INSERT_GAME.
    _GAME_KEYS = (
        "date", "num-of-game", "day-of-week",
        "length-in-outs", "day-night", "completion-info",
        "forfeit", "protest", "park-id",
        "attendance", "length-in-min", "home-plate-ump-id",
        "home-plate-ump-name", "1b-plate-ump-id", "1b-plate-ump-name",
        "2b-plate-ump-id", "2b-plate-ump-name", "3b-plate-ump-id",
        "3b-plate-ump-name", "lf-plate-ump-id", "lf-plate-ump-name",
        "rf-plate-ump-id", "rf-plate-ump-name",
    )

    # CSV column suffixes (after the "visiting-" / "home-" prefix) for the
    # team_game columns that follow ``home`` in _INSERT_TEAM_GAME, same order.
    _TEAM_STAT_KEYS = (
        "at-bats",
        "hits", "doubles", "triples",
        "homeruns", "rbi", "sacrifice-hits",
        "sacrifice-flies", "hit-by-pitch", "walks",
        "intentional-walks", "strikeouts", "stolen-bases",
        "caught-stealing", "grounded-double", "interference",
        "left-on-base", "pitchers-used", "individual-earned-runs",
        "team-earned-runs", "wild-pitches", "balks",
        "putouts", "assists", "errors",
        "passed", "double-play", "triple-play",
        "manager-id", "manager-name", "start-pitcher-id",
        "start-pitcher-name",
    ) + tuple(
        f"starting-{slot}-{part}"
        for slot in range(1, 10)
        for part in ("id", "name", "position")
    )

    # Keys of the hourly weather payload, in column order of _INSERT_WEATHER
    # (between ``game_id`` and ``sun_rise``).
    _WEATHER_KEYS = (
        "temperature_2m", "relative_humidity_2m", "dew_point_2m",
        "apparent_temperature", "pressure_msl", "precipitation",
        "rain", "snowfall", "cloud_cover",
        "wind_speed_10m", "wind_direction_10m", "wind_gusts_10m",
    )

    _INSERT_GAME = """
        INSERT INTO games
        (
            game_date, game_number, day_of_week,
            length_in_outs, day_night, completion_info,
            forfeit, protest, park_id,
            attendence, length_in_minutes, home_plate_ump_id,
            home_plate_ump_name, b1_ump_id, b1_ump_name,
            b2_ump_id, b2_ump_name, b3_ump_id,
            b3_ump_name, lf_ump_id, lf_ump_name,
            rf_ump_id, rf_ump_name
        )
        VALUES
        (
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?
        )
    """

    _INSERT_TEAM_GAME = """
        INSERT INTO team_game
        (
            game, team, game_num,
            score, line_score, win, home, at_bats,
            hits, doubles, triples,
            homeruns, rbis, sacrifice_hits,
            sacrifice_flies, hit_by_pitch, walks,
            intentional_walks, strikeouts, stolen_bases,
            caught_stealing, grounded_double, interference,
            left_on_base, pitchers_used, individual_earned_runs,
            earned_runs, wild_pitches, balks,
            putouts, assists, errors,
            passed, double_play, triple_play,
            manager_id, manager_name, starting_pitcher_id,
            starting_pitcher_name, starting_1_id, starting_1_name,
            starting_1_position, starting_2_id, starting_2_name,
            starting_2_position, starting_3_id, starting_3_name,
            starting_3_position, starting_4_id, starting_4_name,
            starting_4_position, starting_5_id, starting_5_name,
            starting_5_position, starting_6_id, starting_6_name,
            starting_6_position, starting_7_id, starting_7_name,
            starting_7_position, starting_8_id, starting_8_name,
            starting_8_position, starting_9_id, starting_9_name,
            starting_9_position
        )
        VALUES
        (
            ?, ?, ?,
            ?, ?, ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?
        )
    """

    _INSERT_WEATHER = """
        INSERT INTO weather
        (
            game_id, temperature, humidity,
            dew_point, apparent_temperature, air_pressure,
            precipitation, rain, snowfall,
            cloud_cover, wind_speed, wind_direction,
            wind_gusts, sun_rise, sun_set,
            moon_phase
        )
        VALUES
        (
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?,
            ?
        )
    """

    def __init__(self, database: "Database"):
        """Keep the database handle used for every lookup and insert."""
        self.database = database

    def parse_all_data(self, source_dir, dest_dir):
        """Import every file of ``source_dir`` that is not yet in ``dest_dir``.

        ``dest_dir`` doubles as the record of finished imports: a file is
        copied there only after ``parse_one_file`` returned True for it, so a
        file whose import failed is retried on the next run.

        Args:
            source_dir: Directory holding the CSV files to import.
            dest_dir: Directory of already-imported files; created if missing.
        """
        os.makedirs(dest_dir, exist_ok=True)
        already_imported = set(os.listdir(dest_dir))

        # Sorted so the import order is the same on every run instead of
        # following arbitrary set iteration order.
        pending = sorted(
            name for name in os.listdir(source_dir)
            if name not in already_imported
        )

        for filename in pending:
            src_file = os.path.join(source_dir, filename)
            if not os.path.isfile(src_file):
                # A sub-directory cannot be opened as a CSV file.
                continue
            if not self.parse_one_file(src_file):
                continue
            dest_file = os.path.join(dest_dir, filename)
            try:
                shutil.copy(src_file, dest_file)
                print(f"Copied {filename} to {dest_dir}")
            except OSError as e:
                print(f"Failed to copy {filename}: {e}")

    def parse_one_file(self, filepath):
        """Import every game row of one CSV file.

        Args:
            filepath: Path of a CSV file whose first line is the header row.

        Returns:
            True when every row was handled; False as soon as
            ``populate_database_with_stats`` returns False for a row (the
            remaining rows are not processed).
        """
        # newline="" leaves line-ending handling to the csv module, which is
        # required for quoted fields that contain line breaks.  The rows are
        # read up front so the file is not held open during the long pauses.
        with open(filepath, "r", newline="") as bb_data_file:
            games = list(csv.DictReader(bb_data_file))

        for count, game in enumerate(games, start=1):
            print(f"Current line {count}")
            self._throttle(count)
            if not self.populate_database_with_stats(game):
                return False
        return True

    def _throttle(self, count):
        """Pause at fixed row counts so the free API is not overwhelmed."""
        for interval, seconds, message in self._THROTTLE_STEPS:
            if count % interval == 0:
                print(message)
                time.sleep(seconds)
                break

    def _team_fields(self, game_stats, side, won, is_home):
        """Build one team_game row, minus the leading game id, for ``side``."""
        lead = [
            game_stats[f"{side}-team"], game_stats[f"{side}-game-num"],
            game_stats[f"{side}-score"], game_stats[f"{side}-line-scores"],
            won, is_home,
        ]
        return lead + [game_stats[f"{side}-{key}"] for key in self._TEAM_STAT_KEYS]

    def populate_database_with_stats(self, game_stats) -> bool:
        """Store one game, its two team rows and its weather in the database.

        Everything that can fail on bad input (CSV lookups, score parsing,
        weather and sun/moon lookups) happens before the first insert.
        Otherwise a failure half-way through would leave a ``games`` row
        behind, and the "already added" check would skip that game forever.

        Args:
            game_stats: Mapping of CSV column name to value for a single game.

        Returns:
            True when the game was stored, or skipped because its park is
            unknown or the game is already in the database.  False when the
            weather lookup reported an error other than
            "No weather data available"; nothing is inserted in that case.

        Raises:
            KeyError: If an expected CSV column is missing.
            ValueError: If a score is not an integer.
        """
        park_id = game_stats["park-id"]
        park_data = self.database.select(
            "SELECT latitude, longitude FROM parks WHERE park_id = ?", (park_id,)
        )
        if park_data is None:
            print(f"{park_id} is None")
            return True

        check_game_added_query = "SELECT id FROM games WHERE game_date = ? AND game_number = ? AND park_id = ?"
        check_game_added_data = [game_stats["date"], game_stats["num-of-game"], park_id]
        if self.database.select(check_game_added_query, check_game_added_data) is not None:
            return True

        # Validate the CSV row before spending a weather API request on it.
        visiting_score = int(game_stats["visiting-score"])
        home_score = int(game_stats["home-score"])
        game_row = [game_stats[key] for key in self._GAME_KEYS]
        # Strict comparisons: a tied game marks neither side as the winner.
        visiting_fields = self._team_fields(
            game_stats, "visiting", visiting_score > home_score, 0
        )
        home_fields = self._team_fields(
            game_stats, "home", home_score > visiting_score, 1
        )

        # Hour of the day used for the weather sample: 15 for day games ("D"),
        # 19 for everything else.
        hour = 15 if game_stats["day-night"] == "D" else 19
        historic_weather = get_weather(park_data[0], park_data[1], game_stats["date"], hour)
        if "error" in historic_weather:
            # A payload without "details" is treated as an unknown error.
            details = str(historic_weather.get("details", ""))
            print(f"Error: {historic_weather['error']}: Details: {details}")
            if "No weather data available" not in details:
                return False
            hourly = None
        elif "hourly" not in historic_weather:
            print(f"Failed to get weather: Full JSON: {historic_weather}")
            hourly = None
        else:
            hourly = historic_weather["hourly"]

        weather_fields = None
        if hourly is not None:
            sunrise_time, sunset_time, moonphase = get_sun_and_moon_phase(
                park_data[0], park_data[1], game_stats["date"]
            )
            weather_fields = [hourly[key][hour] for key in self._WEATHER_KEYS]
            weather_fields += [sunrise_time, sunset_time, moonphase]

        game_id = self.database.insert(self._INSERT_GAME, game_row)
        self.database.insert(self._INSERT_TEAM_GAME, [game_id, *visiting_fields])
        self.database.insert(self._INSERT_TEAM_GAME, [game_id, *home_fields])
        if weather_fields is not None:
            self.database.insert(self._INSERT_WEATHER, [game_id, *weather_fields])
        return True