import os
import glob

def clean_file(file_path):
    """Read a transcript file, strip the leading timestamp fields, and return the text.

    Each line is expected to be comma-separated, with the first three fields
    holding timestamp/metadata and everything after the third comma holding
    the spoken text (which may itself contain commas). Lines with fewer than
    four fields, or whose text is empty after stripping, are dropped.

    Args:
        file_path: Path to a ``.txt`` transcript file (read as UTF-8).

    Returns:
        str: The cleaned text of all kept lines, joined by single spaces.
    """
    cleaned_lines = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materializing it with readlines().
        for line in f:
            # maxsplit=3 leaves the tail (including any embedded commas)
            # intact in parts[3], so no re-join of the split pieces is needed.
            parts = line.strip().split(',', 3)
            if len(parts) > 3:
                cleaned_text = parts[3].strip()
                if cleaned_text:  # skip lines whose text field is blank
                    cleaned_lines.append(cleaned_text)
    return ' '.join(cleaned_lines)

def process_files(input_dir='./t3-transcripts', output_prefix='combined_'):
    """Combine cleaned transcripts from *input_dir* into numbered output files.

    Each ``.txt`` file in *input_dir* is cleaned via :func:`clean_file`, given
    a markdown-style ``## <filename stem>`` heading, and appended to the
    current output file. When appending a transcript would push the running
    word count past ``MAX_WORDS``, the accumulated content is written out and
    a new output file is started.

    Args:
        input_dir: Directory scanned (non-recursively) for ``*.txt`` files.
        output_prefix: Prefix for generated files, e.g. ``combined_1.txt``.

    Returns:
        list[str]: Names of the output files written, in creation order.
    """
    MAX_WORDS = 500000  # word budget per combined output file

    txt_files = glob.glob(os.path.join(input_dir, '*.txt'))
    output_files = []
    current_output = []  # headered texts accumulated for the current output file
    current_word_count = 0
    file_counter = 1

    def _flush():
        # Write the accumulated texts to the next numbered output file
        # and reset the accumulator.
        nonlocal current_output, current_word_count, file_counter
        output_file_name = f"{output_prefix}{file_counter}.txt"
        with open(output_file_name, 'w', encoding='utf-8') as f:
            # strip() removes the blank-line padding before the first heading.
            f.write(''.join(current_output).strip())
        output_files.append(output_file_name)
        current_output = []
        current_word_count = 0
        file_counter += 1

    for file_path in sorted(txt_files):  # sorted for deterministic output order
        title = os.path.splitext(os.path.basename(file_path))[0]
        cleaned_text = clean_file(file_path)
        # Five blank lines before each heading visually separate transcripts.
        headered_text = f"\n\n\n\n\n## {title}\n{cleaned_text}"
        words_in_text = len(headered_text.split())

        # BUG FIX: the original flushed even when current_output was empty
        # (i.e. the very first transcript alone exceeded the limit), which
        # wrote an empty combined_1.txt. Only flush when there is content.
        if current_output and current_word_count + words_in_text > MAX_WORDS:
            _flush()
        current_output.append(headered_text)
        current_word_count += words_in_text

    # Write whatever remains after the loop (the final partial batch).
    if current_output:
        _flush()

    return output_files

if __name__ == '__main__':
    # Run with the default input directory and report the generated files.
    created = process_files()
    print(f"Processed files and created: {', '.join(created)}")