-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathcutgff.sh
More file actions
executable file
·90 lines (76 loc) · 3.03 KB
/
cutgff.sh
File metadata and controls
executable file
·90 lines (76 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/bin/bash
usage(){
	# Prints the help text to stdout, framed by one blank line before and after.
	# printf with a fixed '%s\n' format emits the text verbatim regardless of
	# how the running shell's echo treats backslashes or leading dashes.
	printf '%s\n' "
Written by Brian Bushnell
Last modified October 15, 2019
Description: Cuts out features defined by a gff file, and writes them
to a new fasta. Features are output in their sense strand.
Usage: cutgff.sh in=<fna file> gff=<gff file> out=<fna file>
in= is optional, and gff filenames will be automatically assumed based on
the fasta name if not specified. This allows running on multiple files:
cutgff.sh types=rRNA out=16S.fa minlen=1440 maxlen=1620 attributes=16S bacteria/*.fna.gz
File Parameters:
in=<file> Input FNA (fasta) file.
gff=<file> Input GFF file (optional).
out=<file> Output FNA file.
Other Parameters:
types=CDS Types of features to cut.
invert=false Invert selection: rather than outputting the features,
mask them with Ns in the original sequences.
attributes= A comma-delimited list of strings. If present, one of
these strings must be in the gff line attributes.
bannedattributes= A comma-delimited list of banned strings.
banpartial=t Ignore lines with 'partial=true' in attributes.
minlen=1 Ignore lines shorter than this.
maxlen=2147483647 Ignore lines longer than this.
renamebytaxid=f Rename sequences with their taxID. Input sequences
must be named appropriately, e.g. in NCBI format.
taxmode=accession Valid modes are:
accession: Sequence names must start with an accession.
gi: Sequence names must start with gi|number
taxid: Sequence names must start with tid|number
header: Best effort for various header formats.
requirepresent=t Crash if a taxID cannot be found for a sequence.
oneperfile=f Only output one sequence per file.
align=f Align ribosomal sequences to consensus (if available);
discard those with low identity, and flip those
annotated on the wrong strand.
maxns=-1 If non-negative, ignore features with more than this many
undefined bases (Ns or IUPAC symbols).
maxnfraction=-1.0 If non-negative, ignore features with more than this
fraction of undefined bases (Ns or IUPAC symbols).
Should be 0.0 to 1.0.
"
}
# Show the help text and stop when the script is run without a first argument
# (unset or empty) or when help is explicitly requested.
case "$1" in
	""|-h|--help)
		usage
		exit
		;;
esac
resolveSymlinks(){
	# Locates the real file behind $0 and chooses the Java classpath.
	# Globals written:
	#   SCRIPT - path of this script after following every symlink in the chain
	#   DIR    - absolute directory that contains the resolved script
	#   CP     - "$DIR/bbtools.jar" if that file exists, otherwise "$DIR/current/"
	#
	# CDPATH is cleared for each cd: when it is set in the environment, cd on a
	# relative directory name may enter a different directory and print its
	# path, and that extra output would end up inside the command substitution.
	# The "--" guards protect against names that begin with a dash.
	SCRIPT="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)/$(basename -- "$0")"
	while [ -h "$SCRIPT" ]; do
		DIR="$(dirname -- "$SCRIPT")"
		SCRIPT="$(readlink "$SCRIPT")"
		# A relative link target is relative to the directory holding the link.
		[ "${SCRIPT#/}" = "$SCRIPT" ] && SCRIPT="$DIR/$SCRIPT"
	done
	DIR="$(CDPATH= cd -- "$(dirname -- "$SCRIPT")" && pwd)"
	if [ -f "$DIR/bbtools.jar" ]; then
		CP="$DIR/bbtools.jar"
	else
		CP="$DIR/current/"
	fi
}
setEnv(){
	# Sources the two helper scripts found in $DIR (javasetup.sh first, then
	# memdetect.sh), then calls parseJavaArgs with three fixed options followed
	# by the caller's arguments, and finally setEnvironment with no arguments.
	# parseJavaArgs and setEnvironment are not defined in this file; presumably
	# the sourced helpers provide them.
	local helperScript
	for helperScript in javasetup.sh memdetect.sh; do
		. "$DIR/$helperScript"
	done
	parseJavaArgs --xmx=200m --xms=200m --mode=fixed "$@"
	setEnvironment
}
launch() {
	# Runs gff.CutGff on the JVM, forwarding every argument of this function
	# to the Java program unchanged, and returns java's exit status.
	# Globals read:    EA EOOM SIMD XMX XMS (JVM options) and CP (classpath);
	#                  they must be set before this is called.
	# Globals written: CMD, a flattened copy of the command used for logging.
	# Output:          the command line is echoed to stderr before it runs.

	# Log-only rendering; it is never executed, so quoting is irrelevant here.
	CMD="java $EA $EOOM $SIMD $XMX $XMS -cp $CP gff.CutGff $*"
	printf '%s\n' "$CMD" >&2

	# Only the JVM option variables are expanded into the eval'd text, so an
	# option value can still carry its own shell quoting or hold several
	# options. The classpath and the caller's arguments are referenced as
	# "$CP" and "$@" and expanded by eval itself, which keeps spaces, globs,
	# quotes and shell metacharacters in them literal instead of re-parsing
	# them as shell code.
	# NOTE(review): the option variables are still evaluated as shell text;
	# this assumes they only ever come from this tool's own setup code.
	eval "java $EA $EOOM $SIMD $XMX $XMS -cp \"\$CP\" gff.CutGff \"\$@\""
}
# Main sequence; the order matters because each step consumes globals set by
# the one before it.
# 1. Set SCRIPT, DIR and CP from the location of this script.
resolveSymlinks
# 2. Source the helper scripts from $DIR and hand them the command-line arguments.
setEnv "$@"
# 3. Build the java command line, echo it to stderr and run it.
launch "$@"