-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathoptimized_rsync.sh
More file actions
321 lines (264 loc) · 8.51 KB
/
optimized_rsync.sh
File metadata and controls
321 lines (264 loc) · 8.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
#!/bin/bash
# Optimized rsync for Large Scientific Data Transfers
# =====================================================
# Author: Your Name
# Purpose: Efficiently sync large datasets over long-distance SSH connections
#
# This script addresses common bottlenecks when transferring large scientific datasets
# (genomics, ML models, HDF5 files, etc.) to remote HPC systems.
#
# Key optimizations:
# - TCP buffer tuning for high-latency connections
# - SSH cipher selection for better throughput
# - Compression options for different data types
# - Progress monitoring and resume capability
# Abort the script as soon as a command fails outside of a condition.
set -e

# Fallback settings, used when the matching command-line option is absent.
DEFAULT_BUFFER_SIZE=524288                # 512KB - good for most long-distance connections
DEFAULT_CIPHER=aes128-gcm@openssh.com     # Fast, secure cipher
DEFAULT_COMPRESSION=true
DEFAULT_THREADS=4

# Colour escape sequences for tagged output. They are stored as literal
# backslash text here and only expanded by the print helpers at output time.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
NC="\033[0m"    # No Color (reset)
# Functions to print colored output
#
# print_info: write a message to stdout prefixed with a blue "[INFO]" tag.
# Globals:   BLUE, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
print_info() {
  # %b expands the escape sequences held in the colour variables while %s
  # prints the message literally. `echo -e` would additionally interpret
  # backslashes inside the message itself (e.g. a path containing "\t").
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}
# print_success: write a message to stdout prefixed with a green "[SUCCESS]" tag.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
print_success() {
  # Only the colour variables go through %b; the message uses %s so that
  # backslashes in it are not interpreted as escape sequences.
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"
}
# print_warning: write a message to stdout prefixed with a yellow "[WARNING]" tag.
# Globals:   YELLOW, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
print_warning() {
  # Only the colour variables go through %b; the message uses %s so that
  # backslashes in it are not interpreted as escape sequences.
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"
}
# print_error: write a message to stderr prefixed with a red "[ERROR]" tag.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - message text, printed verbatim
print_error() {
  # Errors go to stderr so they stay visible when stdout is piped or
  # redirected. %s keeps backslashes in the message literal (unlike echo -e).
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}
#######################################
# Print the help text on stdout and terminate the script.
# Globals:   DEFAULT_BUFFER_SIZE, DEFAULT_CIPHER (read; interpolated into text)
# Arguments: $1 - exit status to terminate with (optional, default 0), so
#                 callers reporting a usage error can exit non-zero
# Note:      the here-document is intentionally unquoted so that $0 and the
#            default values are expanded in the output.
#######################################
usage() {
  cat << EOF
Usage: $0 [OPTIONS] SOURCE DESTINATION
Optimized rsync for large scientific data transfers over SSH.
Arguments:
SOURCE Source directory or file
DESTINATION Destination in format: [user@]host:/path
Options:
-b SIZE Buffer size in bytes (default: $DEFAULT_BUFFER_SIZE)
Common values:
- 262144 (256KB) for LAN transfers
- 524288 (512KB) for WAN transfers (default)
- 1048576 (1MB) for very long distance
- 4194304 (4MB) for intercontinental
-c CIPHER SSH cipher (default: $DEFAULT_CIPHER)
Fast options:
- aes128-gcm@openssh.com (recommended)
- aes128-ctr
- chacha20-poly1305@openssh.com
-z Enable compression (default: on)
-n Disable compression
-p Preserve permissions and timestamps (adds -a flag)
-e PATTERN Exclude pattern (can be used multiple times)
Common excludes:
- "*.pyc" "*.pyo" "__pycache__"
- ".git" ".venv" "node_modules"
- "*.log" "wandb/"
-t THREADS Number of parallel threads (for rsync 3.2.0+)
-d Dry run - show what would be transferred
-v Verbose output
-h Show this help message
--test-speed Test connection speed before transfer
--diagnose Run network diagnostics
Examples:
# Basic optimized transfer
$0 ./my_data/ user@hpc.university.edu:/scratch/user/project/
# Transfer with maximum optimization for intercontinental connection
$0 -b 4194304 -c aes128-ctr ./large_dataset/ user@remote:/data/
# Exclude unnecessary files and use dry run
$0 -d -e "*.log" -e "wandb/" -e "__pycache__" ./project/ user@hpc:/home/user/
# Transfer compressed data without additional compression
$0 -n ./compressed_data.tar.gz user@server:/backups/
Network Tuning Tips:
1. For connections with high latency (>50ms), increase buffer size
2. For data that's already compressed (.h5, .tar.gz), disable compression with -n
3. For many small files, compression (-z) helps significantly
4. Monitor transfer with: iftop, nload, or bmon
Troubleshooting Slow Transfers:
1. Run with --diagnose to check network path
2. Try different ciphers (-c option)
3. Adjust buffer sizes based on your connection
4. Check if firewall/IDS is throttling SSH
5. Consider using Globus or bbcp for extremely large datasets
EOF
  exit "${1:-0}"
}
#######################################
# Measure latency (ping) and rough SSH throughput to a remote destination.
# Arguments: $1 - destination in the form [user@]host:/path
# Outputs:   ping output and the transfer rate reported by dd on stdout;
#            a warning when a step fails
# Returns:   0 (failures are reported as warnings, never fatal)
#######################################
test_speed() {
  local dest=$1

  # A destination without "host:" is a local path; there is nothing to probe.
  if [[ "$dest" != *:* ]]; then
    print_warning "Destination is not remote (no host:path); skipping speed test"
    return 0
  fi

  print_info "Testing connection speed to $dest..."

  # Extract [user@]host from destination, then the bare host name for ping
  local target=${dest%%:*}
  local hostname=${target##*@}

  # Test ping
  print_info "Testing latency..."
  ping -c 5 "$hostname" 2>/dev/null \
    || print_warning "Ping failed (ICMP might be blocked)"

  # Test SSH speed
  print_info "Testing SSH throughput..."
  # dd writes its transfer-rate summary to stderr. The rate must therefore be
  # taken from the stderr of the dd|ssh pipeline; discarding dd's stderr (as
  # was done before) leaves grep with nothing to match.
  { dd if=/dev/zero bs=1M count=10 | ssh "$target" "cat > /dev/null"; } 2>&1 \
    | grep -oE '[0-9.]+ [kMGT]B/s' \
    || print_warning "SSH speed test failed"
}
#######################################
# Print basic network diagnostics for the host part of a destination:
# DNS resolution, the first hops of the route, and the local SSH cipher list.
# Arguments: $1 - destination in the form [user@]host:/path
# Outputs:   truncated tool output on stdout; a warning per failed step
# Returns:   0 (failures are reported as warnings, never fatal)
#######################################
diagnose_connection() {
  local dest=$1
  local target=${dest%%:*}
  local host=${target##*@}
  local output

  print_info "Running network diagnostics for $host..."

  # Each tool's output is captured first and truncated afterwards. In a
  # `tool | head || fallback` pipeline the tested status is head's, so the
  # fallback/warning would never run when the tool itself fails.

  # Check DNS
  print_info "DNS resolution:"
  if output=$(nslookup "$host" 2>/dev/null); then
    printf '%s\n' "$output" | head -5
  elif output=$(host "$host" 2>/dev/null); then
    printf '%s\n' "$output"
  else
    print_warning "DNS lookup failed"
  fi

  # Check route
  print_info "Network path (first 10 hops):"
  if output=$(traceroute -m 10 "$host" 2>/dev/null); then
    printf '%s\n' "$output" | head -15
  else
    print_warning "Traceroute failed"
  fi

  # Check SSH ciphers
  print_info "Available SSH ciphers:"
  if output=$(ssh -Q cipher 2>/dev/null); then
    printf '%s\n' "$output" | head -10
  else
    print_warning "Cannot query SSH ciphers"
  fi
}
# Parse arguments
#######################################
# Parse the leading options of an argument list into global settings.
# Globals written:
#   BUFFER_SIZE, CIPHER, COMPRESSION, EXCLUDE_ARGS, DRY_RUN, VERBOSE,
#   PRESERVE, THREADS, TEST_SPEED, DIAGNOSE
#   OPTIONS_CONSUMED - number of leading arguments that were options, so the
#                      caller can shift them away
# Globals read: DEFAULT_BUFFER_SIZE, DEFAULT_CIPHER
# Arguments: the script's argument list ("$@")
# Exits:     1 after print_error on an unknown option or a missing/invalid
#            option value; via usage for -h/--help
#######################################
parse_options() {
  local total=$#
  local escaped

  BUFFER_SIZE=$DEFAULT_BUFFER_SIZE
  CIPHER=$DEFAULT_CIPHER
  COMPRESSION="-z"
  EXCLUDE_ARGS=""
  DRY_RUN=""
  VERBOSE=""
  PRESERVE=""
  THREADS=""
  TEST_SPEED=false
  DIAGNOSE=false

  while [[ $# -gt 0 ]]; do
    # Options that take a value: fail with a message instead of letting
    # `shift 2` abort the script silently under `set -e`.
    case "$1" in
      -b|-c|-e|-t)
        if [[ $# -lt 2 ]]; then
          print_error "Option $1 requires an argument"
          exit 1
        fi
        ;;
    esac

    case "$1" in
      -b)
        # The value is used in arithmetic and in an rsync option later on.
        if ! [[ "$2" =~ ^[1-9][0-9]*$ ]]; then
          print_error "Buffer size must be a positive integer (bytes): $2"
          exit 1
        fi
        BUFFER_SIZE="$2"
        shift 2
        ;;
      -c)
        CIPHER="$2"
        shift 2
        ;;
      -z)
        COMPRESSION="-z"
        shift
        ;;
      -n)
        COMPRESSION=""
        shift
        ;;
      -p)
        PRESERVE="-a"
        shift
        ;;
      -e)
        # EXCLUDE_ARGS ends up inside a string that is run through `eval`,
        # so the pattern is shell-escaped (%q) rather than wrapped in '...',
        # which breaks as soon as the pattern itself contains a quote.
        printf -v escaped '%q' "$2"
        EXCLUDE_ARGS="$EXCLUDE_ARGS --exclude=$escaped"
        shift 2
        ;;
      -t)
        # NOTE(review): rsync(1) does not appear to document a --threads
        # option; confirm the targeted rsync build accepts it.
        if ! [[ "$2" =~ ^[1-9][0-9]*$ ]]; then
          print_error "Thread count must be a positive integer: $2"
          exit 1
        fi
        THREADS="--threads=$2"
        shift 2
        ;;
      -d)
        DRY_RUN="--dry-run"
        shift
        ;;
      -v)
        VERBOSE="-v"
        shift
        ;;
      --test-speed)
        TEST_SPEED=true
        shift
        ;;
      --diagnose)
        DIAGNOSE=true
        shift
        ;;
      -h|--help)
        usage
        ;;
      --)
        # Explicit end of options; what follows is SOURCE DESTINATION.
        shift
        break
        ;;
      -?*)
        print_error "Unknown option: $1"
        exit 1
        ;;
      *)
        break
        ;;
    esac
  done

  OPTIONS_CONSUMED=$(( total - $# ))
}

parse_options "$@"
# Drop the parsed options so that $1/$2 are SOURCE and DESTINATION.
shift "$OPTIONS_CONSUMED"
# Check remaining arguments: SOURCE and DESTINATION are both required.
if [[ $# -lt 2 ]]; then
  print_error "Missing required arguments"
  echo ""
  # usage terminates the shell it runs in with a success status. Running it
  # in a subshell prints the help text only, so this usage error can still
  # be reported to the caller with a non-zero exit status.
  ( usage )
  exit 1
fi
SOURCE="$1"
DESTINATION="$2"
# Validate source exists (file or directory; the destination is not checked
# here because it is normally remote)
if [[ ! -e "$SOURCE" ]]; then
  print_error "Source does not exist: $SOURCE"
  exit 1
fi
# Optional pre-flight steps, each followed by a blank separator line.
# Diagnostics (--diagnose) run first, then the speed test (--test-speed).
case "$DIAGNOSE" in
  true)
    diagnose_connection "$DESTINATION"
    echo ""
    ;;
esac
case "$TEST_SPEED" in
  true)
    test_speed "$DESTINATION"
    echo ""
    ;;
esac
# Build rsync command
RSYNC_CMD="rsync"
# Basic flags
RSYNC_CMD="$RSYNC_CMD $PRESERVE $VERBOSE $COMPRESSION -P"
# Add socket options for TCP buffer tuning
RSYNC_CMD="$RSYNC_CMD --sockopts=SO_SNDBUF=$BUFFER_SIZE,SO_RCVBUF=$BUFFER_SIZE"
# Add SSH options
SSH_OPTS="-c $CIPHER"
if [ ! -z "$VERBOSE" ]; then
SSH_OPTS="$SSH_OPTS -v"
fi
RSYNC_CMD="$RSYNC_CMD -e 'ssh $SSH_OPTS'"
# Add exclude patterns
if [ ! -z "$EXCLUDE_ARGS" ]; then
RSYNC_CMD="$RSYNC_CMD $EXCLUDE_ARGS"
fi
# Add threading if specified
if [ ! -z "$THREADS" ]; then
RSYNC_CMD="$RSYNC_CMD $THREADS"
fi
# Add dry run if specified
if [ ! -z "$DRY_RUN" ]; then
RSYNC_CMD="$RSYNC_CMD $DRY_RUN"
fi
# Add source and destination
RSYNC_CMD="$RSYNC_CMD '$SOURCE' '$DESTINATION'"
# Summarise the effective settings before anything is transferred.
if [[ -n "$COMPRESSION" ]]; then
  compression_state="Enabled"
else
  compression_state="Disabled"
fi
print_info "Configuration:"
echo " Source: $SOURCE"
echo " Destination: $DESTINATION"
# Buffer size is shown in bytes and, via integer division, in KB.
echo " Buffer size: $BUFFER_SIZE bytes ($(($BUFFER_SIZE / 1024))KB)"
echo " SSH cipher: $CIPHER"
echo " Compression: $compression_state"
if [[ -n "$DRY_RUN" ]]; then
  echo " Mode: DRY RUN (no files will be transferred)"
fi
echo ""
# Echo the exact command line that is about to run.
print_info "Executing command:"
echo " $RSYNC_CMD"
echo ""
# Execute
#######################################
# Run the assembled rsync command and report the outcome.
# Globals read: RSYNC_CMD - shell-quoted command string
#               DRY_RUN   - non-empty when this is a dry run
# Returns:      0 on success; on failure exits the script with the status
#               of the executed command
#######################################
run_transfer() {
  local status=0

  if [[ -n "$DRY_RUN" ]]; then
    print_warning "DRY RUN MODE - No files will be transferred"
  fi

  # The status is captured in the same statement for two reasons: under
  # `set -e` a bare failing command would abort the script before the error
  # is reported, and `$?` is overwritten by every later command (including
  # the test that inspects it). The command string is quoted so eval sees it
  # exactly as built, without word splitting or globbing beforehand.
  eval "$RSYNC_CMD" || status=$?

  # Check exit status
  if (( status == 0 )); then
    print_success "Transfer completed successfully!"
  else
    print_error "Transfer failed with error code $status"
    exit "$status"
  fi
}

run_transfer