import pandas as pd
import requests
import time
import sys

# Settings
INPUT_FILE = 'person_2025_update.csv'   # Pantheon person export; must contain 'wd_id' and 'hpi' columns
OUTPUT_FILE = 'pantheon_with_birth_order.csv'  # enriched CSV written by fetch_data()
BATCH_SIZE = 50   # QIDs per SPARQL request (keeps query size within endpoint limits)
LIMIT = 500  # Process top 500 famous people for now to avoid timeout
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"  # public Wikidata Query Service

def get_sparql_query(qids):
    """Build a SPARQL query fetching birth dates and siblings for *qids*.

    The QIDs are injected into a VALUES clause; siblings (P3373) and their
    birth dates (P569) are wrapped in OPTIONAL so people with no recorded
    siblings still produce a row.
    """
    value_clause = " ".join(f"wd:{qid}" for qid in qids)
    return f"""
    SELECT ?person ?personLabel ?birthDate ?sibling ?siblingBirthDate WHERE {{
      VALUES ?person {{ {value_clause} }}
      ?person wdt:P31 wd:Q5 .
      ?person wdt:P569 ?birthDate . 
      OPTIONAL {{ 
        ?person wdt:P3373 ?sibling .
        ?sibling wdt:P569 ?siblingBirthDate .
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """

def _fetch_batch(batch, max_retries=3):
    """Run one SPARQL query for a batch of QIDs against the Wikidata endpoint.

    Retries the SAME batch up to *max_retries* times when the endpoint answers
    HTTP 429 (rate limited), sleeping 10 s between attempts — the original code
    dropped the batch entirely on 429.  Returns the list of result bindings,
    or None if every attempt was rate limited.  Raises requests.HTTPError for
    other non-2xx responses.
    """
    for _ in range(max_retries):
        response = requests.get(
            SPARQL_ENDPOINT,
            params={'query': get_sparql_query(batch), 'format': 'json'},
            timeout=60,  # never hang forever on a stalled endpoint
        )
        if response.status_code == 429:
            print("Rate limited, waiting...")
            time.sleep(10)
            continue  # retry the same batch instead of skipping it
        response.raise_for_status()  # surface HTTP errors instead of a JSON decode error
        return response.json()['results']['bindings']
    return None


def _build_family_map(bindings):
    """Group SPARQL bindings into {person_qid: {'birth': str, 'siblings': {qid: birth}}}.

    Siblings are keyed by QID so that duplicate result rows — e.g. a sibling
    with several recorded P569 values — are counted once (first date seen
    wins).  The original list-append version double-counted such siblings.
    """
    family_map = {}
    for item in bindings:
        p_qid = item['person']['value'].split('/')[-1]
        entry = family_map.setdefault(
            p_qid, {'birth': item['birthDate']['value'], 'siblings': {}}
        )
        if 'sibling' in item and 'siblingBirthDate' in item:
            s_qid = item['sibling']['value'].split('/')[-1]
            entry['siblings'].setdefault(s_qid, item['siblingBirthDate']['value'])
    return family_map


def _birth_order_rows(family_map):
    """Turn a family map into result rows with computed birth order.

    People with no sibling rows are skipped: a missing P3373 statement is
    ambiguous (only child vs. missing data), so no birth order is claimed.
    Birth order is 1-based (eldest == 1), computed by lexicographic
    comparison of the ISO-formatted date strings.
    """
    rows = []
    for qid, info in family_map.items():
        sibs = info['siblings']
        if not sibs:
            continue

        # Drop "unknown value" placeholders — these appear to serialize as
        # blank-node ids starting with "t"; NOTE(review): confirm this format.
        valid_dates = [d for d in sibs.values() if d and not d.startswith('t')]

        older_count = sum(1 for d in valid_dates if d < info['birth'])

        rows.append({
            'wd_id': qid,
            'birth_order': older_count + 1,          # 1 == first-born
            'total_siblings': len(valid_dates) + 1,  # family size incl. the person
            'sibling_data_count': len(valid_dates),  # siblings with usable dates
        })
    return rows


def fetch_data(df):
    """Enrich the top-LIMIT people (by 'hpi') with sibling birth-order data.

    Queries Wikidata in batches of BATCH_SIZE, computes each person's birth
    order among siblings with known birth dates, inner-joins the results back
    onto the (trimmed) input frame, and writes it to OUTPUT_FILE.

    Parameters:
        df: pandas DataFrame with at least 'wd_id' and 'hpi' columns.
    Returns:
        None; output is written to OUTPUT_FILE as a side effect.
    """
    results = []

    # Restrict to the most famous people to keep run time bounded.
    df = df.sort_values(by='hpi', ascending=False).head(LIMIT)
    qids = df['wd_id'].tolist()

    print(f"Fetching data for top {len(qids)} people...")

    for i in range(0, len(qids), BATCH_SIZE):
        batch = qids[i:i+BATCH_SIZE]
        print(f"Processing batch {i} to {i+len(batch)}...")

        try:
            bindings = _fetch_batch(batch)
            if bindings is None:
                print("Batch dropped: still rate limited after retries.")
                continue

            results.extend(_birth_order_rows(_build_family_map(bindings)))

            time.sleep(1)  # Be nice to API

        except Exception as e:
            # Best-effort: log and move on so one bad batch doesn't kill the run.
            print(f"Error in batch: {e}")

    if not results:
        print("No sibling data found.")
        return

    res_df = pd.DataFrame(results)
    # Inner join keeps only people for whom sibling data was found.
    final_df = df.merge(res_df, on='wd_id', how='inner')

    print(f"Enriched {len(final_df)} records with birth order.")
    final_df.to_csv(OUTPUT_FILE, index=False)
    print(f"Saved to {OUTPUT_FILE}")

if __name__ == "__main__":
    df = pd.read_csv(INPUT_FILE)
    fetch_data(df)