Moving CSVs around and improving the handling of duplicate data. Pulling all relevant data.

This commit is contained in:
2025-09-15 18:46:37 +00:00
parent 3950e99151
commit 125d133af5
9 changed files with 2450 additions and 33 deletions

3
.gitignore vendored
View File

@@ -2,5 +2,4 @@
.Trash-1000 .Trash-1000
target target
database database
data/stats/imported data/__pycache__
data/__pycache__

View File

@@ -23,10 +23,15 @@ class Database:
else: else:
return None return None
def selectall(self, query, values): def selectall(self, query, values = None):
# Query the database for the specified index # Query the database for the specified index
cursor = self.db.cursor() cursor = self.db.cursor()
cursor.execute(query, values)
if values is None:
cursor.execute(query)
else:
cursor.execute(query, values)
result = cursor.fetchall() result = cursor.fetchall()
if result: if result:
return result return result

View File

@@ -253,8 +253,6 @@ def pull_training_data(database: Database, game_date: str, game_number: int, par
print(f"Failed to get game data for date: {game_date}, number: {game_number}, park: {park_id}") print(f"Failed to get game data for date: {game_date}, number: {game_number}, park: {park_id}")
return None return None
print(curr_game)
select_teams = """ select_teams = """
SELECT SELECT
win, team, home, win, team, home,
@@ -400,6 +398,4 @@ def pull_training_data(database: Database, game_date: str, game_number: int, par
] ]
training_data = [*training_data, *prev_game_data] training_data = [*training_data, *prev_game_data]
print(f"{training_result}\n{training_data}")
return (training_result, training_data) return (training_result, training_data)

View File

Can't render this file because it is too large.

View File

Can't render this file because it is too large.

View File

Can't render this file because it is too large.

2430
data/stats/gl2024.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -11,33 +11,17 @@ class Importer:
def __init__(self, database: Database): def __init__(self, database: Database):
self.database = database self.database = database
def parse_all_data(self, source_dir, dest_dir): def parse_all_data(self, source_dir):
# Ensure the destination directory exists
if not os.path.exists(dest_dir):
os.makedirs(dest_dir)
# List all files in the source and destination directories # List all files in the source and destination directories
source_files = set(os.listdir(source_dir)) source_files = set(os.listdir(source_dir))
for filename in os.listdir(dest_dir):
if filename.endswith('.csv'):
source_files.discard(filename)
dest_files = set(os.listdir(dest_dir))
# Find files that are in the source but not in the destination
missing_files = source_files - dest_files
# Copy any missing CSV files from the source directory to the destination directory # Copy any missing CSV files from the source directory to the destination directory
for filename in missing_files: for filename in source_files:
src_file = os.path.join(source_dir, filename) src_file = os.path.join(source_dir, filename)
dest_file = os.path.join(dest_dir, filename) dest_file = os.path.join(dest_dir, filename)
if self.parse_one_file(f"{source_dir}/{filename}"): if not self.parse_one_file(f"{source_dir}/{filename}"):
try: print(f"Failed to parse {source_dir}/{filename}")
shutil.copy(src_file, dest_file)
print(f"Copied {filename} to {dest_dir}")
except Exception as e:
print(f"Failed to copy {filename}: {e}")
def parse_one_file(self, filepath): def parse_one_file(self, filepath):
bb_dict = {} bb_dict = {}
@@ -61,6 +45,7 @@ class Importer:
time.sleep(60*60*24) time.sleep(60*60*24)
if not self.populate_database_with_stats(game): if not self.populate_database_with_stats(game):
print(f"Failed to parse and populate {game}")
return False return False
return True return True

12
main.py
View File

@@ -17,12 +17,14 @@ inputs = np.array([[0, 0, 1, 0],
outputs = np.array([[0], [0], [0], [1], [1], [1]]) outputs = np.array([[0], [0], [0], [1], [1], [1]])
if __name__ == '__main__': if __name__ == '__main__':
#db_file = "./database/baseball.db" db_file = "./database/baseball.db"
#db_conn = Database(db_file) db_conn = Database(db_file)
#pull_training_data(db_conn, "20240602", 0, "BAL12") query = "SELECT game_date, game_number, park_id FROM games ORDER BY game_date"
all_games = db_conn.selectall(query)
print(get_sun_and_moon_phase(39.283889, -76.621667, "20240602")) for game in all_games:
game_result, training_data = pull_training_data(db_conn, str(game[0]), game[1], game[2])
""" """
build_db_path = "./data/sql/build_db.sql" build_db_path = "./data/sql/build_db.sql"
@@ -34,7 +36,7 @@ if __name__ == '__main__':
db_conn.run_sql_file(fill_teams_path) db_conn.run_sql_file(fill_teams_path)
imp = Importer(db_conn) imp = Importer(db_conn)
imp.parse_all_data("./data/stats/to_import", "./data/stats/imported/") imp.parse_all_data("./data/stats/", "./data/stats/imported/")
""" """
""" """