#!/usr/bin/env python3
"""
Couples Data Collector from Wikidata
------------------------------------
Fetches couples (spouses) where both partners have known birth dates.
Used for Projects 10, 26, 36.
"""
import requests
import pandas as pd
import time
from pathlib import Path
# NOTE(review): OUTPUT_FILE is not referenced anywhere else in the visible
# code; the __main__ section writes to per-project paths instead. Confirm
# whether another module reads it before removing.
OUTPUT_FILE = Path("couples_data_wikidata.csv")
def _is_placeholder_label(label):
    """Return True when *label* is a bare Q-id or purely numeric.

    The Wikidata label service falls back to the entity's Q-id when no label
    exists in the requested language, so such a value is not a usable name.
    """
    return label.isdigit() or (label.startswith("Q") and label[1:].isdigit())


def _date_part(binding):
    """Return the text before 'T' of an optional SPARQL date binding, or ''.

    *binding* is one variable's entry from a result row, or None when the
    OPTIONAL variable was unbound. Non-literal bindings are treated as
    missing: Wikidata "unknown value" statements are returned as blank
    nodes / skolem IRIs rather than date literals, and must not end up in a
    date column.
    """
    if not binding or binding.get("type") in ("uri", "bnode"):
        return ""
    return binding.get("value", "").split("T")[0]


def fetch_wikidata_couples():
    """Fetch spouse pairs with known birth dates from Wikidata.

    Runs a single SPARQL query against the public Wikidata endpoint and
    flattens the JSON bindings into one row per couple.

    Returns:
        A pandas DataFrame with the columns ``p1_name``, ``p1_birth_date``,
        ``p1_birth_time``, ``p1_death_date``, ``p2_name``, ``p2_birth_date``,
        ``p2_birth_time``, ``p2_death_date``, ``start_date`` and ``end_date``
        (dates as ``YYYY-MM-DD`` strings, ``''`` when unknown, birth times
        always ``None``), or ``None`` if the request failed or the response
        was not valid JSON. The frame is empty (no columns) when no usable
        rows were returned.
    """
    print("Querying Wikidata for couples...")

    # Person1 has spouse Person2; fetch birth/death dates and the marriage
    # start/end qualifiers.
    #  - STR(?person1) < STR(?person2) keeps only one of A-B / B-A.
    #  - Birth year > 1800 restricts to better-documented records.
    # NOTE: LIMIT without ORDER BY means the endpoint chooses which rows
    # are returned.
    query = """
    SELECT DISTINCT
      ?person1 ?person1Label ?birth1 ?death1
      ?person2 ?person2Label ?birth2 ?death2
      ?start ?end
    WHERE {
      ?person1 wdt:P31 wd:Q5; # P1 is Human
               wdt:P569 ?birth1. # P1 has Birth Date
      OPTIONAL { ?person1 wdt:P570 ?death1. } # P1 Death
      ?person1 p:P26 ?stmt. # P1 has Spouse Statement
      ?stmt ps:P26 ?person2. # Spouse is P2
      ?person2 wdt:P31 wd:Q5; # P2 is Human
               wdt:P569 ?birth2. # P2 has Birth Date
      OPTIONAL { ?person2 wdt:P570 ?death2. } # P2 Death
      OPTIONAL { ?stmt pq:P580 ?start. } # Marriage Start
      OPTIONAL { ?stmt pq:P582 ?end. } # Marriage End (Divorce/Death)
      # Filter for modern-ish history (better documentation)
      FILTER(YEAR(?birth1) > 1800 && YEAR(?birth2) > 1800)
      # Semantic constraint: Only pair them once
      FILTER(STR(?person1) < STR(?person2))
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    LIMIT 4000
    """

    url = "https://query.wikidata.org/sparql"
    headers = {
        'User-Agent': 'AstrologyResearch/1.0 (academic research attempt)',
        'Accept': 'application/json',
    }

    try:
        response = requests.get(
            url,
            params={'query': query, 'format': 'json'},
            headers=headers,
            timeout=60,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: network/HTTP failure; ValueError: body is not JSON.
        print(f"Error fetching data: {e}")
        return None

    results = []
    malformed = 0      # rows missing a required binding
    placeholders = 0   # rows whose name is only a Q-id / number

    for item in data['results']['bindings']:
        try:
            p1_name = item['person1Label']['value']
            p1_birth = item['birth1']['value'].split('T')[0]
            p2_name = item['person2Label']['value']
            p2_birth = item['birth2']['value'].split('T')[0]
        except KeyError:
            malformed += 1
            continue

        # A Q-id or numeric "name" means the label lookup found nothing.
        if _is_placeholder_label(p1_name) or _is_placeholder_label(p2_name):
            placeholders += 1
            continue

        # Raw dates only; relationship status is left to the analysis step.
        results.append({
            'p1_name': p1_name,
            'p1_birth_date': p1_birth,
            'p1_birth_time': None,
            'p1_death_date': _date_part(item.get('death1')),
            'p2_name': p2_name,
            'p2_birth_date': p2_birth,
            'p2_birth_time': None,
            'p2_death_date': _date_part(item.get('death2')),
            'start_date': _date_part(item.get('start')),
            'end_date': _date_part(item.get('end')),
        })

    if malformed or placeholders:
        print(
            f"Skipped {malformed} malformed rows and "
            f"{placeholders} rows without a usable name."
        )

    df = pd.DataFrame(results)
    print(f"Fetched {len(df)} couples.")
    return df
if __name__ == "__main__":
    # Script entry point: fetch the couples once and write the same CSV into
    # each project folder that consumes it (Projects 10, 26 and 36).
    df = fetch_wikidata_couples()
    if df is not None and not df.empty:
        project_dirs = (
            "10-synastry-relationship-longevity",
            "26-compatibility-relationship-satisfaction",
            "36-synastry-harmonics-logistic",
        )
        for project_dir in project_dirs:
            out_path = Path(project_dir) / "new_couples_wikidata.csv"
            # Create the folder for every project (including Project 10) so
            # a missing directory does not abort the save.
            out_path.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(out_path, index=False)
            print(f"Saved to {out_path}")