Moving around csvs and better handling of duplicate data. Pulling all relevant data

This commit is contained in:
2025-09-15 18:46:37 +00:00
parent 3950e99151
commit 125d133af5
9 changed files with 2450 additions and 33 deletions

1
.gitignore vendored
View File

@@ -2,5 +2,4 @@
.Trash-1000
target
database
data/stats/imported
data/__pycache__

View File

@@ -23,10 +23,15 @@ class Database:
else:
return None
def selectall(self, query, values):
def selectall(self, query, values = None):
# Query the database for the specified index
cursor = self.db.cursor()
if values is None:
cursor.execute(query)
else:
cursor.execute(query, values)
result = cursor.fetchall()
if result:
return result

View File

@@ -253,8 +253,6 @@ def pull_training_data(database: Database, game_date: str, game_number: int, par
print(f"Failed to get game data for date: {game_date}, number: {game_number}, park: {park_id}")
return None
print(curr_game)
select_teams = """
SELECT
win, team, home,
@@ -400,6 +398,4 @@ def pull_training_data(database: Database, game_date: str, game_number: int, par
]
training_data = [*training_data, *prev_game_data]
print(f"{training_result}\n{training_data}")
return (training_result, training_data)

View File

Can't render this file because it is too large.

View File

Can't render this file because it is too large.

View File

Can't render this file because it is too large.

2430
data/stats/gl2024.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -11,33 +11,17 @@ class Importer:
def __init__(self, database: Database):
self.database = database
def parse_all_data(self, source_dir, dest_dir):
# Ensure the destination directory exists
if not os.path.exists(dest_dir):
os.makedirs(dest_dir)
def parse_all_data(self, source_dir):
# List all files in the source and destination directories
source_files = set(os.listdir(source_dir))
for filename in os.listdir(dest_dir):
if filename.endswith('.csv'):
source_files.discard(filename)
dest_files = set(os.listdir(dest_dir))
# Find files that are in the source but not in the destination
missing_files = source_files - dest_files
# Copy any missing CSV files from the source directory to the destination directory
for filename in missing_files:
for filename in source_files:
src_file = os.path.join(source_dir, filename)
dest_file = os.path.join(dest_dir, filename)
if self.parse_one_file(f"{source_dir}/{filename}"):
try:
shutil.copy(src_file, dest_file)
print(f"Copied {filename} to {dest_dir}")
except Exception as e:
print(f"Failed to copy {filename}: {e}")
if not self.parse_one_file(f"{source_dir}/{filename}"):
print(f"Failed to parse {source_dir}/{filename}")
def parse_one_file(self, filepath):
bb_dict = {}
@@ -61,6 +45,7 @@ class Importer:
time.sleep(60*60*24)
if not self.populate_database_with_stats(game):
print(f"Failed to parse and populate {game}")
return False
return True

12
main.py
View File

@@ -17,12 +17,14 @@ inputs = np.array([[0, 0, 1, 0],
outputs = np.array([[0], [0], [0], [1], [1], [1]])
if __name__ == '__main__':
#db_file = "./database/baseball.db"
#db_conn = Database(db_file)
db_file = "./database/baseball.db"
db_conn = Database(db_file)
#pull_training_data(db_conn, "20240602", 0, "BAL12")
query = "SELECT game_date, game_number, park_id FROM games ORDER BY game_date"
all_games = db_conn.selectall(query)
print(get_sun_and_moon_phase(39.283889, -76.621667, "20240602"))
for game in all_games:
game_result, training_data = pull_training_data(db_conn, str(game[0]), game[1], game[2])
"""
build_db_path = "./data/sql/build_db.sql"
@@ -34,7 +36,7 @@ if __name__ == '__main__':
db_conn.run_sql_file(fill_teams_path)
imp = Importer(db_conn)
imp.parse_all_data("./data/stats/to_import", "./data/stats/imported/")
imp.parse_all_data("./data/stats/", "./data/stats/imported/")
"""
"""