To handle larger text files (20MB, ~500k lines) efficiently, you need to keep memory usage bounded and process the data incrementally. Here's the modified code with those improvements:
```python
import json
import re
from json import JSONDecodeError

import pandas as pd


def json_to_excel(json_file, excel_file='result.xlsx', chunk_size=10000):
    all_data = []       # records accumulated for the current chunk
    current_obj = ""    # buffer for a JSON object that spans several lines
    line_count = 0
    chunk_count = 0

    # Open the file once and process it line by line
    with open(json_file, 'r') as file:
        for line in file:
            line = line.strip()
            line_count += 1

            # Progress reporting
            if line_count % 50000 == 0:
                print(f"Processed {line_count} lines...")

            if not line:
                continue

            current_obj += line

            # Try to parse whenever we have content. Objects that were
            # concatenated ("}{") are first split onto separate lines.
            cleaned = re.sub(r'}\s*{', '}\n{', current_obj)
            try:
                parsed = [json.loads(part) for part in cleaned.split('\n')]
            except JSONDecodeError:
                # Not a complete JSON value yet -- keep accumulating
                continue

            for data in parsed:
                if isinstance(data, dict):
                    all_data.append(data)
                elif isinstance(data, list):
                    all_data.extend(data)
            current_obj = ""

            # Process in chunks to keep memory usage bounded
            if len(all_data) >= chunk_size:
                process_chunk(all_data, excel_file, chunk_count)
                chunk_count += 1
                all_data = []

    # Process any remaining data
    if all_data:
        process_chunk(all_data, excel_file, chunk_count)

    print(f'Data written successfully to {excel_file}')


def process_chunk(data_chunk, excel_file, chunk_number):
    """Helper function to flatten a chunk and write/append it to Excel."""
    df = pd.json_normalize(data_chunk)
    if chunk_number == 0:
        # Write a new file with the header row
        df.to_excel(excel_file, index=False, engine='openpyxl')
    else:
        # Append below the existing rows without repeating the header
        # (if_sheet_exists='overlay' requires pandas >= 1.4)
        with pd.ExcelWriter(excel_file, mode='a', engine='openpyxl',
                            if_sheet_exists='overlay') as writer:
            df.to_excel(writer, sheet_name='Sheet1', index=False, header=False,
                        startrow=writer.sheets['Sheet1'].max_row)


# Example usage
json_file_path = "large_file.txt"
json_to_excel(json_file_path)
```
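As a quick, hypothetical smoke test (the file names `sample.txt` and `sample.xlsx` are invented), you can generate a tiny input and check that every record survives the chunked conversion:

```python
# Assumes json_to_excel and the imports above are defined in the same module
sample_records = [{"id": i, "name": f"user_{i}"} for i in range(5)]

with open("sample.txt", "w") as f:
    for record in sample_records:
        f.write(json.dumps(record) + "\n")

# Force several chunks even for this tiny file
json_to_excel("sample.txt", excel_file="sample.xlsx", chunk_size=2)

# Read the result back and confirm the row count
print(len(pd.read_excel("sample.xlsx")))  # expected: 5
```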
Key Improvements:

- **Stream Processing:**
  - Reads the file line by line instead of loading the entire file into memory
  - Accumulates and parses JSON objects incrementally as lines arrive
- **Chunk Processing:**
  - Collects records in chunks (10,000 at a time by default)
  - Writes each chunk to Excel before moving on, so only one chunk is held in memory
- **Memory Optimization:**
  - Clears the processed records from memory after each chunk is written
  - Accumulates records as plain dicts and lists; a DataFrame is built only per chunk
- **Progress Reporting:**
  - Prints progress every 50,000 lines so you can monitor long runs
- **Error Handling:**
  - Treats a `JSONDecodeError` as "object not complete yet" and keeps accumulating
  - Splits concatenated JSON objects (`}{` on one line) before parsing (see the standalone example after this list)
- **Performance:**
  - Minimal per-line string work (one strip and one concatenation per line)
  - A single regex substitution applied only to the current buffer, not the whole file
- **Append Mode:**
  - Uses pandas' `ExcelWriter` in append mode to write chunks sequentially into the same sheet
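To make the error-handling point concrete, here is a minimal standalone sketch of the splitting step used in the loop above; the `buffer` value is invented for illustration:

```python
import json
import re

# Two objects glued together on one line, as sometimes happens in log-style dumps
buffer = '{"id": 1, "name": "a"}{"id": 2, "name": "b"}'

# Insert a newline at each "}{" boundary, then parse each piece separately
cleaned = re.sub(r'}\s*{', '}\n{', buffer)
records = [json.loads(part) for part in cleaned.split('\n')]

print(records)  # [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}]
```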
Additional Recommendations:
- For extremely large files (100MB+), consider a smaller chunk size:

  ```python
  # Use these parameters for very large files
  json_to_excel("huge_file.txt", chunk_size=5000)
  ```

- If you need even better performance:
  - Consider using CSV instead of Excel for intermediate processing (see the sketch after this list)
  - Or use a database such as SQLite for intermediate storage
- For monitoring memory usage, you can add:

  ```python
  import psutil
  print(f"Memory usage: {psutil.Process().memory_info().rss / 1024 / 1024:.2f} MB")
  ```
This modified version should handle your 20MB file with 500k lines efficiently while keeping memory usage manageable.