import pandas as pd
import sys
import os
import glob
from datetime import datetime
def find_fry9c_files(directory):
"""
Find all CSV files starting with FRY9C in the given directory and subdirectories
Args:
directory (str): Root directory to search
Returns:
list: List of file paths matching the pattern
"""
pattern = os.path.join(directory, "**", "FRY9C*.csv")
files = glob.glob(pattern, recursive=True)
return files
def extract_trading_revenue_from_file(file_path):
"""
Extract trading revenue data from a single FRY9C CSV file
Args:
file_path (str): Path to the CSV file
Returns:
list: List of dictionaries containing extracted data
"""
try:
# Read the CSV file
df = pd.read_csv(file_path)
# Extract institution name and report date
institution_row = df[df['ItemName'] == 'Institution Name']
report_date_row = df[df['ItemName'] == 'Report Date']
if institution_row.empty or report_date_row.empty:
print(f"Warning: Could not find institution name or report date in {file_path}")
return []
institution_name = institution_row['Value'].iloc[0]
report_date_raw = report_date_row['Value'].iloc[0]
# Convert report date from YYYYMMDD to a readable format
report_date_str = str(report_date_raw)
report_date = datetime.strptime(report_date_str, '%Y%m%d').strftime('%Y-%m-%d')
# Filter rows where Description starts with "Trading Revenue" or "TRADING REVENUE"
trading_revenue_mask = df['Description'].fillna('').str.startswith(('Trading Revenue', 'TRADING REVENUE'))
trading_revenue_rows = df[trading_revenue_mask].copy()
# Create the output data for this file
file_output_data = []
for _, row in trading_revenue_rows.iterrows():
# Convert value from thousands to actual USD (multiply by 1,000)
value_in_thousands = float(row['Value'])
value_in_usd = value_in_thousands * 1000
# Clean up the measure name by removing "EXPOSURES" and "(BHC CONSOLIDATED)"
measure_name = row['Description']
measure_name = measure_name.replace(' EXPOSURES', '').replace(' (BHC CONSOLIDATED)', '')
file_output_data.append({
'InstitutionName': institution_name,
'Report Date': report_date,
'Measure': measure_name,
'Value': int(value_in_usd) # Convert to int to avoid decimals
})
print(f"Extracted {len(file_output_data)} trading revenue records from {os.path.basename(file_path)}")
return file_output_data
except Exception as e:
print(f"Error processing file {file_path}: {str(e)}")
return []
def extract_trading_revenue_data(directory, output_file):
"""
Extract trading revenue data from all FRY9C CSV files in a directory and subdirectories
Args:
directory (str): Root directory to search for FRY9C files
output_file (str): Path to output CSV file
"""
try:
# Find all FRY9C CSV files
csv_files = find_fry9c_files(directory)
if not csv_files:
print(f"No FRY9C*.csv files found in directory: {directory}")
return None
print(f"Found {len(csv_files)} FRY9C CSV file(s):")
for file_path in csv_files:
print(f" - {file_path}")
# Process all files and combine data
all_output_data = []
for file_path in csv_files:
file_data = extract_trading_revenue_from_file(file_path)
all_output_data.extend(file_data)
if not all_output_data:
print("No trading revenue data found in any of the files")
return None
# Create output DataFrame
output_df = pd.DataFrame(all_output_data)
# Sort by Institution Name and Report Date for better organization
output_df = output_df.sort_values(['InstitutionName', 'Report Date', 'Measure'])
# Save to CSV
output_df.to_csv(output_file, index=False)
print(f"\nSuccessfully extracted {len(all_output_data)} total trading revenue records")
print(f"Output saved to: {output_file}")
# Display summary statistics
print(f"\nSummary:")
print(f"Total records: {len(output_df)}")
print(f"Unique institutions: {output_df['InstitutionName'].nunique()}")
print(f"Unique report dates: {output_df['Report Date'].nunique()}")
print(f"Unique measures: {output_df['Measure'].nunique()}")
# Display preview of extracted data
print("\nPreview of extracted data:")
print(output_df.head(10).to_string(index=False))
return output_df
except Exception as e:
print(f"Error processing directory: {str(e)}")
return None
def main():
"""
Main function to run the extraction script
"""
if len(sys.argv) != 3:
print("Usage: python trading_revenue_extractor.py <directory> <output_file.csv>")
print("Example: python trading_revenue_extractor.py ./data/ trading_revenue_output.csv")
print("The script will search for all FRY9C*.csv files in the directory and subdirectories")
sys.exit(1)
directory = sys.argv[1]
output_file = sys.argv[2]
result = extract_trading_revenue_data(directory, output_file)
if result is not None:
print(f"\n✓ Processing completed successfully!")
else:
print(f"\n✗ Processing failed!")
sys.exit(1)
if __name__ == "__main__":
main()