import pandas as pd
import swisseph as swe
import os
import numpy as np

# Path to DTA
file_path = '../wls_b_15_2.dta'
output_csv = 'birth_order_real_data.csv'

def get_julian_day_midyear(year):
    # July 2nd is roughly mid year
    return swe.julday(year, 7, 2, 12.0)

def extract_and_process():
    print("Loading WLS data...")
    # Read columns for Grad and Siblings
    # We need:
    # idpub (ID)
    # brdxdy (Grad Birth Year)
    # bor (Grad Birth Order)
    # gk1301...gk13016 (Sibling Birth Years approx)

    # Generate list of sibling year columns
    sib_cols = [f'gk130{i}' for i in range(1, 14)] # 1 to 13
    # Check the inspection output, it goes up to gk13017 or so, but let's grab what we saw (gk1301...gk13012 etc from previous logs)
    # Actually let's just grab gk1301 through gk13012 for now as seen in logs.

    cols_to_load = ['idpub', 'brdxdy', 'bor'] + sib_cols

    # Note: Some columns might not exist if I guessed the name wrong, but inspect output showed gk1301..gk13013
    # Let's verify names or use try/except

    try:
        df = pd.read_stata(file_path, columns=cols_to_load)
    except ValueError as e:
        print(f"Column error: {e}")
        # Fallback to reading all and filtering
        df = pd.read_stata(file_path)

    print(f"Loaded {len(df)} records.")

    extracted_data = []

    # Iterate rows
    for index, row in df.iterrows():
        # 1. Add the Graduate
        g_year = row['brdxdy']
        g_order = row['bor']

        # Check validity (sometimes it's NaN or encoded as negative/categorical code)
        # In pandas categoricals, we might need to cast
        try:
            g_year = pd.to_numeric(g_year, errors='coerce')
            g_order = pd.to_numeric(g_order, errors='coerce')

            if not np.isnan(g_year) and not np.isnan(g_order) and g_year > 1900 and g_order > 0:
                extracted_data.append({
                    'family_id': row['idpub'],
                    'birth_year': int(g_year),
                    'birth_order': int(g_order),
                    'source': 'graduate'
                })
        except:
            pass

        # 2. Add the Siblings (Explicitly listed by order)
        # The variables gk1301, gk1302 correspond to 1st child, 2nd child, etc?
        # "Year of oldest sibling's birth" -> This suggests gk1301 is the 1st born sibling?
        # Or just the oldest in the list?
        # Usually "Year of oldest sibling" implies the #1 child of the family (if we include the grad?? or is grad excluded?)

        # Checking logic: "gk071re: Respondent position in sib table."
        # This implies all siblings + grad are in a table.
        # But gk1301 is "Year of oldest sibling's birth".
        # If the Grad is the oldest, is gk1301 the Grad's year? Or the oldest *other* sibling?
        # WLS documentation usually separates Grad from Siblings in these vars or includes them.

        # Let's assume for this "mass harvest" that gk130n corresponds to the Nth child in the family structure 
        # IF we can align it.
        # However, "oldest sibling" might exclude the graduate if the variable labels say "Year of oldest sibling".
        # BUT, to be safe and avoid duplicates or confusion (since we don't know if Grad is inserted in that list),
        # let's mainly rely on the Graduate's data point which is explicitly (Year, Order).
        # We have ~10,000 graduates. That's a good sample size (N=10k).
        # Expanding with siblings is better, but parsing the "who is who" without the mix is risky.

        # HOWEVER, the user asked to "find n-th born data".
        # If we just use graduates, we have a bias: WLS graduates are high school grads from Wisconsin in 1957.
        # They are all ~18 years old in 1957, so born ~1939.
        # This means for Birth Order 1, Year is 1939.
        # For Birth Order 5, Year is 1939.
        # This is CRITICAL: We cannot separate "Birth Order" from "Generational Astrology" if everyone is born in 1939.
        # A 1st born in 1939 has Saturn in Aries/Taurus.
        # A 5th born in 1939 has Saturn in Aries/Taurus.
        # The planetary positions will be IDENTICAL for all birth orders in a single-year cohort.

        # CONCLUSION: WE MUST USE THE SIBLINGS.
        # The siblings have different birth years.
        # gk1301 (Oldest) might be born 1935.
        # gk1305 (5th) might be born 1945.
        # We need to use the sibling arrays to get variation in birth years.

        # Let's extract siblings assuming gk130N is the Nth sibling in the roster.
        # The variables were "Year of oldest sibling's birth" (gk1301), "Year of 2nd oldest..." (gk1302).
        # This strongly implies strictly ordered from 1..N.
        # Does this list include the graduate?
        # Variable "gk071re: Respondent position in sib table" suggests the grad is IN the table or we know where they fit.
        # Actually, let's just take all valid years from gk1301...gk13012.
        # And assign them Birth Order 1...12.
        # This assumes gk1301 is the ACTUAL 1st born.
        # Label: "Year of oldest sibling's birth".
        # If I have 5 siblings and I am the 3rd, and they ask "Year of oldest sibling", do they mean my older brother (the 1st)? Yes.
        # So gk1301 = Child 1. gk1302 = Child 2.

        for i in range(1, 13):
            col_name = f'gk130{i}'
            if col_name in row:
                s_year = row[col_name]
                try:
                    s_year = pd.to_numeric(s_year, errors='coerce')
                    if not np.isnan(s_year) and s_year > 1890 and s_year < 2000:
                        extracted_data.append({
                            'family_id': row['idpub'],
                            'birth_year': int(s_year),
                            'birth_order': i,  # Assuming index i is birth order
                            'source': 'sibling_roster'
                        })
                except:
                    continue

    # Convert to DataFrame
    final_df = pd.DataFrame(extracted_data)

    # Remove duplicates if any (e.g. if we accidentally double counted)
    final_df.drop_duplicates(inplace=True)

    print(f"Extracted {len(final_df)} individual birth records.")
    print(final_df['birth_order'].value_counts().sort_index().head(10))
    print(final_df['birth_year'].describe())

    final_df.to_csv(output_csv, index=False)
    print(f"Saved to {output_csv}")

if __name__ == "__main__":
    extract_and_process()