#!/usr/bin/env python3
"""
Generate synthetic horary data for testing the pipeline.
Generates random chart timestamps from 2000-01-01 up to (but not including)
2025-01-01: 1000 records by default, 600 when this file is run as a script.
Assigns random outcomes (50/50) to serve as a CONTROL group.
"""
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
def generate_synthetic_data(n_samples=1000, output_path='synthetic_horary_data.csv', seed=None):
    """Generate random horary-chart records, write them to CSV and return them.

    Each record holds a random question type, a random timestamp split into
    ``year`` / ``month`` / ``day`` / ``hour`` columns, and a random boolean
    ``outcome`` drawn uniformly from ``[True, False]`` so the data can serve
    as a control group.

    Args:
        n_samples: Number of records to generate. Zero (or a negative
            value) yields an empty table that still has the header row.
        output_path: Destination CSV path; defaults to the file name this
            function has always written to.
        seed: Optional seed for a private random generator, making the
            output reproducible. When ``None`` the module-level ``random``
            functions are used, so a caller's own ``random.seed(...)`` still
            takes effect.

    Returns:
        The generated ``pandas.DataFrame`` (the same data written to CSV).
    """
    start_date = datetime(2000, 1, 1)
    end_date = datetime(2025, 1, 1)
    range_days = (end_date - start_date).days
    question_types = ['lost_item', 'relationship', 'job', 'health', 'journey', 'lawsuit']
    # Fixed column order; also guarantees a header row when no records exist.
    columns = ['question_type', 'year', 'month', 'day', 'hour', 'outcome']

    # A private generator keeps seeded runs from disturbing global random
    # state; without a seed, fall back to the shared module-level generator.
    rng = random if seed is None else random.Random(seed)

    records = []
    for _ in range(n_samples):
        # Random date: whole-day offset inside the range plus a time of day.
        random_days = rng.randrange(range_days)
        random_seconds = rng.randrange(86400)
        dt = start_date + timedelta(days=random_days, seconds=random_seconds)

        # Random outcome (Base rate 50%)
        # NOTE: In a real study, this column would be user-supplied truth.
        outcome = rng.choice([True, False])

        # Random Question Type
        q_type = rng.choice(question_types)

        records.append({
            'question_type': q_type,
            'year': dt.year,
            'month': dt.month,
            'day': dt.day,
            # Decimal hour for swisseph; minute resolution (seconds dropped).
            'hour': dt.hour + dt.minute / 60.0,
            'outcome': outcome,
        })

    df = pd.DataFrame(records, columns=columns)
    df.to_csv(output_path, index=False)
    print(f"Generated {n_samples} synthetic records to {output_path}")
    return df
if __name__ == "__main__":
    # Running this file as a script generates 600 synthetic records.
    generate_synthetic_data(n_samples=600)