To handle larger text files (20MB with ~500k lines) efficiently, you need to optimize memory usage and processing. Here's the modified code with improvements:

import pandas as pd
import json
from json import JSONDecodeError
import re

def json_to_excel(json_file, excel_file='result.xlsx', chunk_size=10000):
    """Stream a text file of JSON objects into an Excel workbook, one chunk at a time."""
    # Buffer of parsed records waiting for the next chunked write
    all_data = []

    # Open the file once and process it line by line
    with open(json_file, 'r', encoding='utf-8') as file:
        current_obj = ""
        line_count = 0
        chunk_count = 0

        for line in file:
            line = line.strip()
            line_count += 1

            # Progress reporting
            if line_count % 50000 == 0:
                print(f"Processed {line_count} lines...")

            if line:
                current_obj += line
                # Try to parse the accumulated buffer
                try:
                    # Split back-to-back objects ("}{") so each piece can be parsed on its own
                    pieces = re.sub(r'}\s*{', '}\n{', current_obj).split('\n')
                    parsed = [json.loads(piece) for piece in pieces]

                    for data in parsed:
                        if isinstance(data, dict):
                            all_data.append(data)
                        elif isinstance(data, list):
                            all_data.extend(data)

                    current_obj = ""

                    # Flush a chunk to Excel once enough records have accumulated
                    if len(all_data) >= chunk_size:
                        process_chunk(all_data, excel_file, chunk_count)
                        chunk_count += 1
                        all_data = []

                except JSONDecodeError:
                    # Buffer is not yet a complete JSON value; keep accumulating
                    continue

    # Process any remaining data
    if all_data:
        process_chunk(all_data, excel_file, chunk_count)

    print(f'Data written successfully to {excel_file}')

def process_chunk(data_chunk, excel_file, chunk_number):
    """Flatten a list of parsed records and write them to the Excel file."""
    df = pd.json_normalize(data_chunk)

    if chunk_number == 0:
        # First chunk: create the file and write the header row
        df.to_excel(excel_file, index=False, engine='openpyxl')
    else:
        # Later chunks: append below the existing rows without repeating the header.
        # Note: if_sheet_exists='overlay' needs a recent pandas (1.4+), and this
        # assumes every chunk produces the same columns in the same order.
        with pd.ExcelWriter(excel_file, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
            df.to_excel(writer, index=False, header=False,
                        startrow=writer.sheets['Sheet1'].max_row)

# Example usage
json_file_path = "large_file.txt"
json_to_excel(json_file_path)

Key Improvements:

  1. Stream Processing:
    • Reads the file line by line instead of loading the entire file into memory
    • Accumulates lines only until a complete JSON value can be parsed, then discards the buffer
  2. Chunk Processing:
    • Processes data in chunks (default 10,000 records at a time)
    • Writes each chunk to Excel to free memory
  3. Memory Optimization:
    • Clears processed data from memory after each chunk
    • Keeps only the current parse buffer and the current chunk in memory
  4. Progress Reporting:
    • Prints progress every 50,000 lines so you can monitor processing
  5. Error Handling:
    • Incomplete JSON values are accumulated instead of raising an error
    • Concatenated objects on one line ("}{") are split and parsed individually (a regex-free alternative is sketched after this list)
  6. Performance:
    • Only the small parse buffer is re-parsed when an object spans multiple lines, never the whole file
  7. Append Mode:
    • Uses Excel's append mode (if_sheet_exists='overlay') to write chunks sequentially
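
If the regex-based split in json_to_excel ever proves fragile, a regex-free way to walk a buffer of concatenated JSON values is json.JSONDecoder.raw_decode, which parses one value and reports where it ended. The sketch below is illustrative only; the helper name iter_json_objects is not part of the code above.

import json

def iter_json_objects(buffer):
    """Yield every complete JSON value found back-to-back in `buffer`."""
    decoder = json.JSONDecoder()
    idx = 0
    while idx < len(buffer):
        # Skip whitespace between concatenated values
        while idx < len(buffer) and buffer[idx].isspace():
            idx += 1
        if idx >= len(buffer):
            break
        # raw_decode returns (value, end_index) and raises JSONDecodeError
        # if the value starting at idx is incomplete or malformed
        obj, idx = decoder.raw_decode(buffer, idx)
        yield obj

Inside the try block you would use parsed = list(iter_json_objects(current_obj)) in place of the re.sub/split step, so nothing is committed to all_data unless the whole buffer parses.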

Additional Recommendations:

  1. For extremely large files (100MB+), consider a smaller chunk size:

   # Use these parameters for very large files
   json_to_excel("huge_file.txt", chunk_size=5000)

  2. If you need even better performance:
    • Consider using CSV format instead of Excel for intermediate processing (a sketch follows this list)
    • Or use a database like SQLite for intermediate storage
  3. For monitoring memory usage, you can add:

   import psutil
   print(f"Memory usage: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")

This modified version should handle your 20MB file with 500k lines efficiently while keeping memory usage manageable.