import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
# Filesystem locations: the survey CSV is looked up in the directory that
# contains this script.
OUTPUT_DIR = Path(__file__).parent
DATA_FILE = OUTPUT_DIR.joinpath("BIG5_data.csv")
def _read_survey(path):
    """Read the survey table, trying tab-separated first, then pandas' default.

    A file read with the wrong separator does not raise; it just produces a
    single wide column. The fallback is therefore triggered both by a parser
    error and by the absence of the ``age`` column this script needs.
    """
    try:
        df = pd.read_csv(path, sep='\t')
    except pd.errors.ParserError as e:
        print(f"Error loading data: {e}. Trying default pandas read...")
        return pd.read_csv(path)
    if 'age' not in df.columns:
        print("Error loading data: no 'age' column found with tab separator. "
              "Trying default pandas read...")
        df = pd.read_csv(path)
    return df


def _conscientiousness_score(df):
    """Return the summed C1..C10 score per row, with reverse-keyed items flipped.

    Missing item responses propagate as NaN so the caller can drop those rows.
    """
    # Reverse keys for C: 2, 4, 6, 8 (based on code in analysis_cyclic.py)
    reverse_keyed_C = {2, 4, 6, 8}
    score = pd.Series(0, index=df.index)
    for i in range(1, 11):
        item = df[f'C{i}']
        # NOTE(review): "6 - item" assumes responses on a 1-5 scale; if the
        # dataset uses another code for skipped items, confirm they are
        # filtered out upstream.
        score = score + ((6 - item) if i in reverse_keyed_C else item)
    return score


def _phase_marker(phase_center):
    """Return the annotation suffix for a bin centred at *phase_center*."""
    if 0.45 <= phase_center <= 0.55:
        return " <--- OPPOSITION (Halfway)"
    if 0.0 <= phase_center <= 0.05 or 0.95 <= phase_center <= 1.0:
        return " <--- RETURN (Start/End)"
    return ""


def load_and_analyze():
    """Load the Big Five survey and print age-detrended Conscientiousness by Saturn phase.

    Steps:
      1. Read ``DATA_FILE`` and keep respondents aged 13-90 (inclusive).
      2. Sum items C1..C10 (reverse-keying C2, C4, C6, C8) into ``Score_C``.
      3. Fit a degree-2 polynomial of score on age and take the residual.
      4. Map age onto a 29.457-unit cycle, split the phase [0, 1) into 20
         fixed bins of width 0.05, and print per-bin means and counts.

    Returns:
        None. Results are printed to stdout. The function returns early
        (after printing a message) when a required column is missing or no
        usable rows remain.
    """
    saturn_period = 29.457  # cycle length, in the same unit as the 'age' column
    n_bins = 20
    bin_width = 1.0 / n_bins  # 0.05

    print(f"Loading {DATA_FILE}...")
    df = _read_survey(DATA_FILE)

    # Validate every column we depend on before doing any work.
    for col in ['age'] + [f'C{i}' for i in range(1, 11)]:
        if col not in df.columns:
            print(f"Column {col} missing")
            return

    # Filter reasonable ages; copy so the column assignments below operate on
    # an independent frame rather than a slice of the raw data.
    df = df[df['age'].between(13, 90)].copy()

    df['Score_C'] = _conscientiousness_score(df)
    # Rows with any missing item have a NaN score and would make the
    # regression fit fail, so drop them explicitly.
    df = df.dropna(subset=['Score_C'])
    if df.empty:
        print("No usable rows after filtering; nothing to analyze.")
        return

    # 1. Expected score based on age (detrending). A quadratic captures the
    #    curved age trend better than a straight line.
    poly = PolynomialFeatures(degree=2)
    X_age = poly.fit_transform(df[['age']])
    model = LinearRegression().fit(X_age, df['Score_C'])
    df['Expected_C'] = model.predict(X_age)
    df['Residual_C'] = df['Score_C'] - df['Expected_C']

    # 2. Phase within the cycle, in [0, 1).
    df['Saturn_Phase'] = (df['age'] % saturn_period) / saturn_period

    # 3. Bin the phases with explicit edges 0.00, 0.05, ..., 1.00. Passing an
    #    integer bin count would instead derive the edges from the observed
    #    min/max, so the printed ranges would not match the actual bins.
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    df['Phase_Bin'] = pd.cut(
        df['Saturn_Phase'], bins=edges, right=False, labels=False
    )

    # 4. One pass over the data for all per-bin statistics.
    summary = df.groupby('Phase_Bin').agg(
        age_mean=('age', 'mean'),
        raw_mean=('Score_C', 'mean'),
        res_mean=('Residual_C', 'mean'),
        n_rows=('Score_C', 'count'),
    )

    print("\n--- SATURN PHASE ANALYSIS (CONSCIENTIOUSNESS) ---")
    print(f"{'Bin':<5} | {'Phase Range':<12} | {'Mean Age':<8} | {'Raw Score':<10} | {'Residual':<10} | {'Count':<6}")
    print("-" * 75)
    for row in summary.itertuples():
        bin_idx = int(row.Index)
        phase_start = bin_idx * bin_width
        phase_end = (bin_idx + 1) * bin_width
        phase_range = f"{phase_start:.2f}-{phase_end:.2f}"
        marker = _phase_marker(phase_start + bin_width / 2)
        # Raw score mean depends heavily on age; the residual is the key metric.
        print(
            f"{bin_idx:<5} | {phase_range:<12} | {row.age_mean:<8.1f} | "
            f"{row.raw_mean:<10.4f} | {row.res_mean:<+10.4f} | "
            f"{row.n_rows:<6}{marker}"
        )
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    load_and_analyze()