import os
import pandas as pd
from pathlib import Path
# Directory that is scanned for per-station ``*.csv`` files. The list of
# selected stations is written to ``selected_stations.txt`` in its parent.
DATA_DIR = Path("/home/rko/Astrology Projects/Big Book of Astrology Research/22-astro-weather-meteorological/daily-summaries-latest")
def get_station_info(file_path):
    """Return the latitude and on-disk size of one station CSV file.

    Only the first data row is parsed, so the call stays cheap even for very
    large files.

    Args:
        file_path: Location of the CSV file (``str`` or ``pathlib.Path``).

    Returns:
        A ``(latitude, size_in_bytes)`` tuple on success, where ``latitude``
        is a ``float``. ``(None, 0)`` is returned when the file cannot be
        read or parsed, has no ``LATITUDE`` column, has no data rows, or its
        first latitude value is missing or not numeric.
    """
    try:
        file_path = Path(file_path)
        # nrows=1: a single data row is enough to learn the latitude.
        first_row = pd.read_csv(file_path, nrows=1)
        if "LATITUDE" not in first_row.columns or first_row.empty:
            return None, 0
        latitude = float(first_row["LATITUDE"].iloc[0])
        # A blank cell is parsed as NaN; treat it like a missing latitude so
        # callers never compare against a meaningless value.
        if pd.isna(latitude):
            return None, 0
        return latitude, file_path.stat().st_size
    except (OSError, ValueError, TypeError):
        # OSError: unreadable/missing file. ValueError: pandas empty-data and
        # parser errors, bad encodings, non-numeric latitude. TypeError:
        # unusable path or cell value. Anything else (e.g. KeyboardInterrupt)
        # is deliberately allowed to propagate.
        return None, 0
def _pick_largest(candidates, accept_latitude, limit):
    """Return up to *limit* ``(path, size, latitude)`` tuples, largest files first."""
    ranked = sorted(candidates, key=lambda p: p.stat().st_size, reverse=True)
    picked = []
    for path in ranked:
        if len(picked) >= limit:
            break
        latitude, size = get_station_info(path)
        if latitude is not None and accept_latitude(latitude):
            picked.append((path, size, latitude))
    return picked


def select_stations(*, count=3, north_min_lat=20, south_max_lat=0):
    """Pick the largest northern and southern station files and record them.

    Files in ``DATA_DIR`` are ranked by size (largest first). The first
    ``count`` files whose latitude is above ``north_min_lat`` form the
    northern set, and the first ``count`` files (from a fixed list of
    filename prefixes) whose latitude is below ``south_max_lat`` form the
    southern set. Both sets are printed, and their paths are written one per
    line to ``selected_stations.txt`` in the parent of ``DATA_DIR``.

    Args:
        count: Number of stations to keep per hemisphere.
        north_min_lat: A northern station must have a latitude strictly
            greater than this value.
        south_max_lat: A southern station must have a latitude strictly
            less than this value. The default of 0 means "anywhere in the
            Southern Hemisphere".

    Returns:
        None.
    """
    # 1. Northern stations: scan every CSV, largest first. Scanning stops as
    #    soon as enough qualifying files have been found.
    print("Selecting North Stations...")
    north_files = _pick_largest(
        DATA_DIR.glob("*.csv"),
        lambda lat: lat > north_min_lat,
        count,
    )

    # 2. Southern stations: only files with these filename prefixes are
    #    examined, which avoids opening every file in the directory.
    #    NOTE(review): the prefixes are taken as given; confirm which
    #    countries they denote and whether the list is complete.
    print("Selecting South Stations...")
    south_prefixes = ("ZA", "ZI", "UY", "AS", "WA")
    south_candidates = [
        path
        for prefix in south_prefixes
        for path in DATA_DIR.glob(f"{prefix}*.csv")
    ]
    south_files = _pick_largest(
        south_candidates,
        lambda lat: lat < south_max_lat,
        count,
    )

    print("Selected North Stations:")
    for path, size, lat in north_files:
        print(f"{path.name} (Size: {size/1024:.1f}KB, Lat: {lat})")
    print("\nSelected South Stations:")
    for path, size, lat in south_files:
        print(f"{path.name} (Size: {size/1024:.1f}KB, Lat: {lat})")

    # Save the selection so later processing steps can read it back.
    output_path = DATA_DIR.parent / "selected_stations.txt"
    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(f"{path}\n" for path, _, _ in north_files + south_files)
if __name__ == "__main__":
    # Run the station selection only when executed as a script, not on import.
    select_stations()