# remove_duplicates.py · The Big Astrology Book of Research

import pandas as pd
import os

def clean_file(filename: str) -> None:
    """Remove duplicate rows from a CSV file, rewriting it in place.

    Rows are considered duplicates when they agree on the first three
    columns (or on all columns if the file has fewer than three); the
    first occurrence of each group is kept. Progress and errors are
    reported with ``print``; failures while reading, deduplicating or
    writing are caught and reported rather than raised.

    Args:
        filename: Path of the CSV file to clean. The file is overwritten
            only when at least one duplicate row was removed.

    Returns:
        None.
    """
    if not os.path.exists(filename):
        print(f"File not found: {filename}")
        return

    print(f"Cleaning {filename}...")
    try:
        # Read every cell as text and disable NA detection. With pandas'
        # default type inference the read/write round trip rewrites values:
        # leading zeros are dropped ("0800" -> "800"), literal "NA"/"null"
        # become empty, and integer columns with gaps turn into floats.
        df = pd.read_csv(filename, dtype=str, keep_default_na=False)
        original_count = len(df)

        # Get first 3 columns
        subset_cols = df.columns[:3].tolist()
        print(f"  Deduplicating based on: {subset_cols}")

        df_clean = df.drop_duplicates(subset=subset_cols, keep='first')
        new_count = len(df_clean)

        print(f"  Removed {original_count - new_count} duplicates. {new_count} records remain.")

        # Nothing was removed: leave the file untouched instead of
        # rewriting it (which would only churn quoting and line endings).
        if new_count == original_count:
            return

        df_clean.to_csv(filename, index=False)

    except Exception as e:
        # Deliberately broad: this is the script's reporting boundary, so a
        # bad file is reported and the caller can move on to the next one.
        print(f"  Error cleaning {filename}: {e}")

if __name__ == "__main__":
    # Deduplicate each dataset CSV in place, one after another.
    for csv_name in (
        "wealthy_birthdata_v2.csv",
        "1000_richest_people_in_the_world.csv",
    ):
        clean_file(csv_name)