Moving around csvs and better handling of duplicate data. Pulling all relevant data

This commit is contained in:
2025-09-15 18:46:37 +00:00
parent 3950e99151
commit 125d133af5
9 changed files with 2450 additions and 33 deletions

1
.gitignore vendored
View File

@@ -2,5 +2,4 @@
.Trash-1000
target
database
data/stats/imported
data/__pycache__

View File

@@ -23,10 +23,15 @@ class Database:
else:
return None
def selectall(self, query, values):
def selectall(self, query, values = None):
# Query the database for the specified index
cursor = self.db.cursor()
if values is None:
cursor.execute(query)
else:
cursor.execute(query, values)
result = cursor.fetchall()
if result:
return result

View File

@@ -253,8 +253,6 @@ def pull_training_data(database: Database, game_date: str, game_number: int, par
print(f"Failed to get game data for date: {game_date}, number: {game_number}, park: {park_id}")
return None
print(curr_game)
select_teams = """
SELECT
win, team, home,
@@ -400,6 +398,4 @@ def pull_training_data(database: Database, game_date: str, game_number: int, par
]
training_data = [*training_data, *prev_game_data]
print(f"{training_result}\n{training_data}")
return (training_result, training_data)

View File

Can't render this file because it is too large.

View File

Can't render this file because it is too large.

View File

Can't render this file because it is too large.

2430
data/stats/gl2024.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -11,33 +11,17 @@ class Importer:
def __init__(self, database: Database):
self.database = database
def parse_all_data(self, source_dir, dest_dir):
# Ensure the destination directory exists
if not os.path.exists(dest_dir):
os.makedirs(dest_dir)
def parse_all_data(self, source_dir):
# List all files in the source and destination directories
source_files = set(os.listdir(source_dir))
for filename in os.listdir(dest_dir):
if filename.endswith('.csv'):
source_files.discard(filename)
dest_files = set(os.listdir(dest_dir))
# Find files that are in the source but not in the destination
missing_files = source_files - dest_files
# Copy any missing CSV files from the source directory to the destination directory
for filename in missing_files:
for filename in source_files:
src_file = os.path.join(source_dir, filename)
dest_file = os.path.join(dest_dir, filename)
if self.parse_one_file(f"{source_dir}/{filename}"):
try:
shutil.copy(src_file, dest_file)
print(f"Copied {filename} to {dest_dir}")
except Exception as e:
print(f"Failed to copy {filename}: {e}")
if not self.parse_one_file(f"{source_dir}/{filename}"):
print(f"Failed to parse {source_dir}/{filename}")
def parse_one_file(self, filepath):
bb_dict = {}
@@ -61,6 +45,7 @@ class Importer:
time.sleep(60*60*24)
if not self.populate_database_with_stats(game):
print(f"Failed to parse and populate {game}")
return False
return True

12
main.py
View File

@@ -17,12 +17,14 @@ inputs = np.array([[0, 0, 1, 0],
outputs = np.array([[0], [0], [0], [1], [1], [1]])
if __name__ == '__main__':
#db_file = "./database/baseball.db"
#db_conn = Database(db_file)
db_file = "./database/baseball.db"
db_conn = Database(db_file)
#pull_training_data(db_conn, "20240602", 0, "BAL12")
query = "SELECT game_date, game_number, park_id FROM games ORDER BY game_date"
all_games = db_conn.selectall(query)
print(get_sun_and_moon_phase(39.283889, -76.621667, "20240602"))
for game in all_games:
game_result, training_data = pull_training_data(db_conn, str(game[0]), game[1], game[2])
"""
build_db_path = "./data/sql/build_db.sql"
@@ -34,7 +36,7 @@ if __name__ == '__main__':
db_conn.run_sql_file(fill_teams_path)
imp = Importer(db_conn)
imp.parse_all_data("./data/stats/to_import", "./data/stats/imported/")
imp.parse_all_data("./data/stats/", "./data/stats/imported/")
"""
"""