-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclaude-parse.py
More file actions
147 lines (121 loc) · 6.02 KB
/
Copy pathclaude-parse.py
File metadata and controls
147 lines (121 loc) · 6.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
claude-parse.py
---------------
Splits a CSV file into multiple chunks, each under a specified size limit.
The original file is never modified — chunks are written as new
`<name>_partNNN.csv` files, by default next to the input file (use
--output-dir to send them to a separate directory).
Usage:
python claude-parse.py # uses defaults below
python claude-parse.py my.csv # positional input file
python claude-parse.py --input my.csv # flag input file
python claude-parse.py --max-mb 2 --output-dir chunks/
"""
import argparse
import os
import sys
from pathlib import Path
import pandas as pd
# ── Defaults ────────────────────────────────────────────────────────────────
# Input CSV path used when neither the positional argument nor --input is given.
DEFAULT_INPUT = "staff-chat-messages-filtered.csv"
# Default per-chunk size limit in megabytes (the default of --max-mb).
DEFAULT_MAX_MB = 2
# ─────────────────────────────────────────────────────────────────────────────
def _csv_byte_size(frame) -> int:
    """Return the UTF-8 byte length of *frame* serialised as CSV (header, no index)."""
    return len(frame.to_csv(index=False).encode("utf-8"))


def split_csv(input_path: str, max_mb: float, output_dir: str) -> None:
    """Split a CSV file into consecutive chunk files of at most ``max_mb`` MB each.

    The input is read fully into memory as strings (no NA conversion) and then
    written out as ``<input stem>_partNNN.csv`` files inside ``output_dir``.
    Every chunk repeats the header row, and each chunk holds as many
    consecutive rows as fit under the limit. If the input is already within
    the limit, nothing is written.

    Args:
        input_path: Path to the CSV file to split.
        max_mb: Maximum size of each chunk in megabytes (1 MB = 1024 * 1024
            bytes). Must be greater than zero.
        output_dir: Directory that receives the chunk files; created if it
            does not exist.

    Raises:
        SystemExit: If ``max_mb`` is not positive, the input file does not
            exist, or a chunk path would resolve to the input file itself.
    """
    # A non-positive limit can never be satisfied and would silently
    # degenerate into one row per file, so reject it up front.
    if max_mb <= 0:
        sys.exit(f"Error: --max-mb must be greater than 0 (got {max_mb})")
    max_bytes = int(max_mb * 1024 * 1024)

    # ── Validate input ───────────────────────────────────────────────────────
    if not os.path.isfile(input_path):
        sys.exit(f"Error: input file not found: {input_path!r}")
    total_bytes = os.path.getsize(input_path)
    print(f"Input file : {input_path}")
    print(f"File size : {total_bytes / 1024 / 1024:.2f} MB")
    print(f"Chunk limit : {max_mb} MB ({max_bytes:,} bytes)")
    if total_bytes <= max_bytes:
        print("File is already within the size limit — no splitting needed.")
        return

    # ── Prepare output directory ─────────────────────────────────────────────
    os.makedirs(output_dir, exist_ok=True)
    # Resolved input path, used below to make sure no chunk overwrites it.
    input_abs = os.path.realpath(input_path)
    base_name = os.path.splitext(os.path.basename(input_path))[0]

    # ── Read full CSV ────────────────────────────────────────────────────────
    print("\nReading CSV …")
    df = pd.read_csv(input_path, dtype=str, keep_default_na=False)
    total_rows = len(df)
    print(f"Total rows : {total_rows:,} | Columns: {len(df.columns)}")

    # ── Find rows-per-chunk by galloping + binary search ─────────────────────
    # Starting guess: the number of average-sized rows that fill one chunk.
    # total_bytes > max_bytes >= 0 here, so the division is safe.
    guess_rows = max(1, int(total_rows * max_bytes / total_bytes))

    chunk_index = 1
    row_start = 0
    files_written = []
    while row_start < total_rows:
        remaining_rows = total_rows - row_start

        # Invariant: `fits` rows are known to fit (0 rows trivially do) and
        # `too_big` rows are known not to fit. `remaining_rows + 1` is a
        # sentinel meaning "no failing size seen yet".
        fits = 0
        too_big = remaining_rows + 1
        probe = min(guess_rows, remaining_rows)
        while too_big - fits > 1:
            probe_size = _csv_byte_size(df.iloc[row_start : row_start + probe])
            if probe_size <= max_bytes:
                fits = probe
                if too_big > remaining_rows:
                    # No upper bound found yet: keep doubling so that regions
                    # with smaller-than-average rows are not capped at the guess.
                    probe = min(fits * 2, remaining_rows)
                else:
                    probe = (fits + too_big) // 2
            else:
                too_big = probe
                probe = (fits + too_big) // 2

        if fits == 0:
            # Even one row (plus header) is over the limit. Still emit it so no
            # data is dropped, but tell the user this file breaks the limit.
            print(f"Warning: row {row_start + 1:,} alone exceeds the chunk limit; "
                  "writing it as an oversized file.", file=sys.stderr)
        rows_in_chunk = max(fits, 1)

        chunk_df = df.iloc[row_start : row_start + rows_in_chunk]
        out_path = os.path.join(output_dir, f"{base_name}_part{chunk_index:03d}.csv")
        # Safety check: never overwrite the original
        if os.path.realpath(out_path) == input_abs:
            sys.exit("Error: output path would overwrite the input file. "
                     "Choose a different --output-dir.")
        chunk_df.to_csv(out_path, index=False)
        actual_size = os.path.getsize(out_path)
        print(f" {os.path.basename(out_path)} "
              f"rows {row_start+1:,}–{row_start+rows_in_chunk:,} "
              f"({actual_size / 1024 / 1024:.2f} MB)")
        files_written.append(out_path)
        row_start += rows_in_chunk
        chunk_index += 1

    # ── Summary ──────────────────────────────────────────────────────────────
    print(f"\nDone. {len(files_written)} file(s) written to: {output_dir}/")
# ── CLI ──────────────────────────────────────────────────────────────────────
def main() -> None:
    """Command-line entry point: parse the arguments and call ``split_csv``.

    The input file may be given positionally or via ``--input``/``-i``; when
    both are present the positional value is used. If ``--output-dir`` is not
    supplied, chunks go to the directory that contains the input file.
    """
    cli = argparse.ArgumentParser(
        description="Split a CSV file into chunks no larger than MAX_MB each."
    )
    cli.add_argument(
        "input_file", nargs="?", default=None,
        help="Path to the input CSV file (positional)",
    )
    cli.add_argument(
        "--input", "-i", default=DEFAULT_INPUT,
        help=f"Path to the input CSV file (default: {DEFAULT_INPUT!r})",
    )
    cli.add_argument(
        "--max-mb", "-m", dest="max_mb", type=float, default=DEFAULT_MAX_MB,
        help=f"Maximum size of each output file in MB (default: {DEFAULT_MAX_MB})",
    )
    cli.add_argument(
        "--output-dir", "-o", dest="output_dir", default=None,
        help="Directory for the output chunk files (default: same folder as input)",
    )
    opts = cli.parse_args()

    # The positional argument takes precedence over the --input flag.
    if opts.input_file:
        source = opts.input_file
    else:
        source = opts.input

    # Fall back to the input file's own folder when no output dir was given.
    if opts.output_dir:
        destination = opts.output_dir
    else:
        destination = str(Path(source).parent)

    split_csv(input_path=source, max_mb=opts.max_mb, output_dir=destination)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()