import pandas as pd
import numpy as np
import scipy.stats as stats
import os
import glob
def run_stats():
print("Initializing Statistical Verification...")
# Paths
base_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(base_dir, 'daily-summaries-latest')
astro_file = os.path.join(base_dir, 'astro_weather_features.csv')
# Load Astro Data
print("Loading astro features...")
astro_df = pd.read_csv(astro_file)
dates = astro_df['date'].values
# Initialize Aggregation DF
agg_df = pd.DataFrame(index=dates, columns=['sum_p', 'count_p']).fillna(0.0)
# Scan Weather Files
files = glob.glob(os.path.join(data_dir, "*.csv"))
# Use 500 files for specific variance check (robust enough for T-test on daily means)
sample_size = 500
files = files[:sample_size]
print(f"Aggregating weather from {len(files)} stations...")
processed = 0
for f in files:
try:
df = pd.read_csv(f, usecols=['DATE', 'PRCP'])
df = df.dropna(subset=['DATE', 'PRCP'])
df = df[df['DATE'].isin(agg_df.index)]
if df.empty: continue
p_sums = df.set_index('DATE')['PRCP']
# Align indices
common_dates = p_sums.index.intersection(agg_df.index)
if common_dates.empty: continue
agg_df.loc[common_dates, 'sum_p'] += p_sums.loc[common_dates]
agg_df.loc[common_dates, 'count_p'] += 1
processed += 1
if processed % 100 == 0:
print(f"Processed {processed}...")
except:
continue
# Calculate Mean Daily Prcp
agg_df['avg_prcp'] = agg_df['sum_p'] / agg_df['count_p']
weather_df = agg_df.reset_index().rename(columns={'index': 'date'}).dropna(subset=['avg_prcp'])
print(f"Weather processed. Valid daily records: {len(weather_df)}")
# Merge
merged = pd.merge(weather_df, astro_df, on='date', how='inner')
# --- TEST 1: Sun (Tropical) in Water ---
print("\n--- TEST 1: SUN (TROPICAL) WATER SIGN EFFECT ---")
water_signs = [3, 7, 11] # Cancer, Scorpio, Pisces
sun_water_days = merged[merged['Sun_Trop_Sign'].isin(water_signs)]['avg_prcp']
sun_non_water_days = merged[~merged['Sun_Trop_Sign'].isin(water_signs)]['avg_prcp']
print(f"Water Days Count: {len(sun_water_days)}")
print(f"Non-Water Days Count: {len(sun_non_water_days)}")
print(f"Water Mean: {sun_water_days.mean():.2f}")
print(f"Non-Water Mean: {sun_non_water_days.mean():.2f}")
t_stat, p_val = stats.ttest_ind(sun_water_days, sun_non_water_days, equal_var=False)
print(f"T-Statistic: {t_stat:.4f}")
print(f"P-Value: {p_val:.4e}")
if p_val < 0.05:
print("RESULT: Statistically Significant (p < 0.05)")
else:
print("RESULT: Not Significant")
# --- TEST 2: Moon (Tropical) in Water ---
print("\n--- TEST 2: MOON (TROPICAL) WATER SIGN EFFECT ---")
moon_water_days = merged[merged['Moon_Trop_Sign'].isin(water_signs)]['avg_prcp']
moon_non_water_days = merged[~merged['Moon_Trop_Sign'].isin(water_signs)]['avg_prcp']
print(f"Water Days Count: {len(moon_water_days)}")
print(f"Non-Water Days Count: {len(moon_non_water_days)}")
print(f"Water Mean: {moon_water_days.mean():.2f}")
print(f"Non-Water Mean: {moon_non_water_days.mean():.2f}")
t_stat_m, p_val_m = stats.ttest_ind(moon_water_days, moon_non_water_days, equal_var=False)
print(f"T-Statistic: {t_stat_m:.4f}")
print(f"P-Value: {p_val_m:.4f}")
if __name__ == "__main__":
run_stats()