import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
from datetime import datetime
import concurrent.futures
import time
def process_weather_chunk(args):
    """Aggregate precipitation/temperature sums and station counts for one chunk of files.

    Runs inside a ProcessPoolExecutor worker, so it takes a single picklable tuple.

    Parameters
    ----------
    args : tuple
        ``(file_list, valid_dates_list)`` — paths of station CSVs to read, and
        the ordered date keys to aggregate onto (defines the output index, so
        results from all workers align exactly).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``valid_dates_list`` with float columns
        ``['sum_p', 'count_p', 'sum_t', 'count_t']``.  ``count_*`` is the number
        of *station files* contributing to a date (duplicate dates within one
        file are summed but counted once).
    """
    files, valid_dates_list = args
    # Set for O(1) membership in the isin() filter below.
    valid_dates = set(valid_dates_list)
    chunk_agg = pd.DataFrame(
        0.0,
        index=pd.Index(valid_dates_list),
        columns=['sum_p', 'count_p', 'sum_t', 'count_t'],
    )
    # Value column -> (sum accumulator, count accumulator).
    col_map = [('PRCP', 'sum_p', 'count_p'), ('TMAX', 'sum_t', 'count_t')]
    for f in files:
        try:
            df = pd.read_csv(f, usecols=['DATE', 'PRCP', 'TMAX'])
            df = df[df['DATE'].isin(valid_dates)]
            if df.empty:
                continue
            for value_col, sum_col, count_col in col_map:
                # groupby-sum collapses duplicate dates within one file so the
                # index is unique before .loc alignment.  (Previously only PRCP
                # was deduplicated; duplicated TMAX dates misaligned the add.)
                series = df[['DATE', value_col]].dropna().groupby('DATE')[value_col].sum()
                common = series.index.intersection(chunk_agg.index)
                if not common.empty:
                    chunk_agg.loc[common, sum_col] += series.loc[common]
                    # One station per file, regardless of duplicate rows.
                    chunk_agg.loc[common, count_col] += 1
        except Exception:
            # Best-effort: malformed or missing-column station files are skipped.
            continue
    return chunk_agg
class GlobalWeatherAnalyzer:
    """Aggregates GHCN-style daily station files and correlates the daily
    global averages with per-date astrological features.

    Typical flow: ``load_astro_data()`` -> ``load_metadata()`` ->
    ``aggregate_weather_data(...)`` -> ``analyze_correlations()``.
    """

    def __init__(self, data_dir, astro_file, metadata_file=None):
        # Directory containing one CSV per station (DATE/PRCP/TMAX columns).
        self.data_dir = data_dir
        # CSV of per-date astrological features: 'date' plus *_Sign columns.
        self.astro_file = astro_file
        # Optional station metadata file (whitespace-separated id, latitude, ...).
        self.metadata_file = metadata_file
        self.astro_df = None
        self.weather_df = None
        # Station id -> latitude, filled by load_metadata().
        self.station_latitudes = {}

    def load_metadata(self):
        """Populate ``station_latitudes`` (id -> latitude) from the metadata file.

        Silently skips if no metadata file was given or it does not exist;
        hemisphere filtering is then disabled.
        """
        if not self.metadata_file or not os.path.exists(self.metadata_file):
            print("Metadata file not found. Skipping latitude filter.")
            return
        print(f"Loading station metadata from {self.metadata_file}...")
        count = 0
        with open(self.metadata_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 2:
                    continue
                try:
                    # parts[0] = station id, parts[1] = latitude (GHCND layout
                    # is whitespace-separated in the first two fields).
                    self.station_latitudes[parts[0]] = float(parts[1])
                    count += 1
                except ValueError:
                    # Malformed latitude field; skip this line.
                    continue
        print(f"Loaded {count} station latitudes.")

    def load_astro_data(self):
        """Load the astrological feature CSV, keeping one row per date."""
        print("Loading astrological features...")
        self.astro_df = pd.read_csv(self.astro_file)
        # Duplicate dates would double-count weather rows in the later merge.
        self.astro_df = self.astro_df.drop_duplicates(subset=['date'])

    def aggregate_weather_data(self, file_limit=None, hemisphere=None):
        """Aggregate all station files into per-date averages (``self.weather_df``).

        Parameters
        ----------
        file_limit : int or None
            If set, only the first N station files are processed (debug runs).
            Checkpoints are neither read nor written for limited runs.
        hemisphere : {'north', 'south', None}
            Restrict to stations with positive/negative latitude.  Requires
            ``load_metadata()`` to have run; otherwise all stations are used
            (the checkpoint tag still reflects the requested hemisphere).
        """
        hemisphere_tag = {'north': 'North', 'south': 'South'}.get(hemisphere, 'Full')
        checkpoint_file = os.path.join(self.data_dir, f'../weather_data_{hemisphere_tag}_checkpoint.csv')
        # Checkpoints only apply to full runs; a file_limit run is partial by design.
        if file_limit is None and os.path.exists(checkpoint_file):
            print(f"Loading {hemisphere_tag} data from checkpoint: {checkpoint_file}")
            self.weather_df = pd.read_csv(checkpoint_file)
            return
        files = glob.glob(os.path.join(self.data_dir, "*.csv"))
        if hemisphere and self.station_latitudes:
            print(f"Filtering stations for {hemisphere} hemisphere...")
            filtered_files = []
            for f in files:
                # Station id is the filename stem.
                st_id = os.path.basename(f).replace('.csv', '')
                lat = self.station_latitudes.get(st_id)
                if lat is None:
                    continue  # no metadata for this station -> exclude
                if (hemisphere == 'north' and lat > 0) or (hemisphere == 'south' and lat < 0):
                    filtered_files.append(f)
            files = filtered_files
            print(f"Found {len(files)} stations in {hemisphere} hemisphere.")
        if file_limit:
            files = files[:file_limit]
            print(f"Processing subset of {len(files)} weather stations (User Limit)...")
        else:
            print(f"Processing ALL {len(files)} weather stations...")
        dates = self.astro_df['date'].values
        # Files per worker task: coarse enough to amortize pickling/IPC cost.
        chunk_size = 2000
        chunks = [files[i:i + chunk_size] for i in range(0, len(files), chunk_size)]
        print(f"Split into {len(chunks)} chunks of ~{chunk_size} files.")
        print("Starting parallel aggregation...")
        grand_agg = pd.DataFrame(0.0, index=dates, columns=['sum_p', 'count_p', 'sum_t', 'count_t'])
        start_time = time.time()
        # Each worker gets the full date list so every returned frame shares
        # grand_agg's index, letting `+=` align element-wise with no reindexing.
        worker_args = [(chunk, dates) for chunk in chunks]
        with concurrent.futures.ProcessPoolExecutor() as executor:
            for i, chunk_result in enumerate(executor.map(process_weather_chunk, worker_args)):
                grand_agg += chunk_result
                elapsed = time.time() - start_time
                progress = (i + 1) / len(chunks) if len(chunks) > 0 else 1
                print(f"Processed Chunk {i+1}/{len(chunks)} ({progress:.1%}) - Elapsed: {elapsed:.0f}s")
        print(f"Finished aggregation. Total time: {time.time() - start_time:.1f}s")
        # 0/0 yields NaN and x/0 yields inf; normalize both to 0 so dates with
        # no reporting stations don't leak NaN into the analysis or checkpoint.
        grand_agg['avg_prcp'] = (grand_agg['sum_p'].divide(grand_agg['count_p'])
                                 .replace([np.inf, -np.inf], np.nan).fillna(0))
        grand_agg['avg_tmax'] = (grand_agg['sum_t'].divide(grand_agg['count_t'])
                                 .replace([np.inf, -np.inf], np.nan).fillna(0))
        self.weather_df = grand_agg.reset_index().rename(columns={'index': 'date'})
        # Persist only full runs: a partial (file_limit) checkpoint would be
        # mistaken for a complete one on the next run.
        if file_limit is None:
            self.weather_df.to_csv(checkpoint_file, index=False)
            print(f"Checkpoint saved to {checkpoint_file}")

    def analyze_correlations(self):
        """Merge weather averages with astro features and compute summary stats.

        Returns
        -------
        pandas.DataFrame
            One 'Sign' row per available ``<Body>_<System>_Sign`` column
            (water- vs non-water-sign precipitation means, plus water-sign
            TMAX mean as 'Water_Tmax'), and one 'Element_Avg' row per element
            present for that column.
        """
        print("Merging astro and weather data...")
        merged = pd.merge(self.weather_df, self.astro_df, on='date', how='inner')
        bodies = ['Sun', 'Moon', 'Mercury', 'Venus', 'Mars', 'Jupiter',
                  'Saturn', 'Uranus', 'Neptune', 'Pluto', 'NorthNode', 'SouthNode']
        systems = ['Trop', 'Vedic']
        # Sign index 0-11; sign % 4 maps Aries..Pisces onto these elements
        # (Fire: 0,4,8 / Earth: 1,5,9 / Air: 2,6,10 / Water: 3,7,11).
        elements = ['Fire', 'Earth', 'Air', 'Water']
        water_signs = [3, 7, 11]
        results = []
        print("Running statistical analysis...")
        for body in bodies:
            for system in systems:
                sign_col = f'{body}_{system}_Sign'
                if sign_col not in merged.columns:
                    continue
                # 1. Water-sign vs non-water-sign precipitation effect.
                water_mask = merged[sign_col].isin(water_signs)
                avg_p_water = merged.loc[water_mask, 'avg_prcp'].mean()
                avg_p_non_water = merged.loc[~water_mask, 'avg_prcp'].mean()
                avg_t_water = merged.loc[water_mask, 'avg_tmax'].mean()
                results.append({
                    'Feature': f'{body} ({system})',
                    'Water_Prcp': avg_p_water,
                    'NonWater_Prcp': avg_p_non_water,
                    'Diff_Prcp': avg_p_water - avg_p_non_water,
                    'Water_Tmax': avg_t_water,
                    'Effect_Type': 'Sign'
                })
                # 2. Per-element averages.  Group by a derived Series instead
                # of writing a temporary column onto `merged`.
                elem_ids = merged[sign_col] % 4
                elem_grp = merged.groupby(elem_ids)[['avg_prcp', 'avg_tmax']].mean()
                for elem_id, row in elem_grp.iterrows():
                    results.append({
                        'Feature': f'{body} ({system})',
                        'Element': elements[int(elem_id)],
                        'Avg_Prcp': row['avg_prcp'],
                        'Avg_Tmax': row['avg_tmax'],
                        'Effect_Type': 'Element_Avg'
                    })
        return pd.DataFrame(results)
def main():
    """Run the full pipeline: load inputs, then aggregate and analyze each hemisphere."""
    base_dir = os.path.dirname(__file__)
    analyzer = GlobalWeatherAnalyzer(
        os.path.join(base_dir, 'daily-summaries-latest'),
        os.path.join(base_dir, 'astro_weather_features.csv'),
        os.path.join(base_dir, 'ghcnd-stations.txt'),
    )
    analyzer.load_astro_data()
    analyzer.load_metadata()
    for hem in ('north', 'south'):
        print("\n\n====================================")
        print(f"STARTING ANALYSIS: {hem.upper()} HEMISPHERE")
        print("====================================")
        # Force a fresh load/aggregation for this hemisphere.
        analyzer.weather_df = None
        analyzer.aggregate_weather_data(file_limit=None, hemisphere=hem)
        results = analyzer.analyze_correlations()
        out_csv = os.path.join(base_dir, f'full_analysis_results_{hem}.csv')
        results.to_csv(out_csv, index=False)
        print(f"Results saved to {out_csv}")
        # Water-sign precipitation signals, strongest increases first.
        sign_results = results[results['Effect_Type'] == 'Sign']
        water_stats = sign_results.sort_values('Diff_Prcp', ascending=False)
        print(f"\n--- Top Water Sign Precipitation Increases ({hem.upper()} Hemisphere) ---")
        print(water_stats[['Feature', 'Water_Prcp', 'Diff_Prcp']].head(5))
        # Per-element averages, pivoted to Body-vs-Element for readability.
        elem_stats = results[results['Effect_Type'] == 'Element_Avg']
        print(f"\n--- Element Precipitation Averages ({hem.upper()}) ---")
        pivot = elem_stats.pivot(index='Feature', columns='Element', values='Avg_Prcp')
        print(pivot)
        pivot.to_csv(os.path.join(base_dir, f'element_precipitation_pivot_{hem}.csv'))


if __name__ == "__main__":
    main()