manipulating all files with a given extension

# Power chassis: generic skeleton that runs a command on every file with a
# given extension and writes one result file per input into another directory.
# Replace "mycodehere" with the real command; "$f" is the current input file.
for f in directory/*.ext; do
  [ -e "$f" ] || continue            # glob matched nothing: skip the literal pattern
  fn=$(basename "$f" .ext)           # file name without directory and extension
  mycodehere "$f" > "outdir/${fn}.newext"
done
# For example: quality-filter all BAM files in a directory.
#   -q 20     keep alignments with mapping quality >= 20
#   -f 0x002  keep only reads flagged as properly paired
#   -F 0x00C  drop reads that are unmapped (0x004) or whose mate is unmapped
#             (0x008); both bits are given as ONE mask so the result does not
#             depend on how a samtools version merges repeated -F options
for f in bam-uf/*.bam; do
  [ -e "$f" ] || continue
  fn=$(basename "$f" .bam)
  samtools view -b -q 20 -f 0x002 -F 0x00C "$f" > "bam-mq20/${fn}.q20.bam"
done

remove empty lines

# Drop every line that is empty or consists only of whitespace.
# One sed call replaces the former cat | perl | sed chain.
sed '/^[[:space:]]*$/d' tocleanup.txt > cleaned.txt

rename all files with a given extension

# Bulk renaming can be dangerous, therefore do it in two steps.
# Step 1 (dry run): only display the commands that would be executed and
# check that they are correct.
for f in *.fastq.gz; do
  [ -e "$f" ] || continue            # no matching file: nothing to show
  printf 'mv -- %s %s\n' "$f" "${f%.fastq.gz}.fq.gz"
done
# Step 2: once the output looks right, perform the rename.
# mv is called directly with quoted arguments (no eval), so file names
# containing spaces or shell metacharacters are handled safely.
for f in *.fastq.gz; do
  [ -e "$f" ] || continue
  mv -- "$f" "${f%.fastq.gz}.fq.gz"
done

copy all files with a given extension in any subdirectory to a new location

# Copy every *sort.bam file into ../../harvest/hot/, dropping the
# sub-directory part of its path.
# NOTE: the recursive ** glob needs zsh, or bash with `shopt -s globstar`.
for i in **/*sort.bam; do
  [ -e "$i" ] || continue            # glob matched nothing
  cp -- "$i" "../../harvest/hot/${i##*/}"   # ${i##*/} = file name without directories
done

copy content of multiple files in subdirectories into a single one and add identifier

Muaahahhahha, I'm brilliant

# Print the content of every *.mapedreads.txt file to stdout, prefixing each
# line with the file's identifier (its name without directory and without the
# .mapedreads.txt suffix) followed by a tab.
# NOTE: the recursive ** glob needs zsh, or bash with `shopt -s globstar`.
for i in **/*.mapedreads.txt; do
  [ -e "$i" ] || continue            # glob matched nothing
  d=$(basename "$i" .mapedreads.txt)
  # perl -s turns the trailing -d=... argument into the variable $d of the script
  perl -spe 's/^/$d\t/' -- -d="$d" < "$i"
done

Kill all Java processes

# Force-kill (SIGKILL) every process whose name contains "java".
# pkill looks the processes up itself, so no ps/grep/awk parsing is needed and
# nothing is executed when no process matches.
# Consider running plain `pkill java` (SIGTERM) first to allow a clean shutdown.
pkill -9 java

Zip all files recursively in a directory, having a given extension

# Compress every regular file ending in .cmh below the current directory, as a
# background job. find works in any shell (no zsh-only ** glob), matches the
# extension literally and copes with whitespace in file names.
find . -type f -name '*.cmh' -exec gzip {} + &


Reheader a bam file

The following workaround is necessary because samtools reheader produces a defective BAM file:
# Emit the new header first, then the header-less alignment records of the old
# BAM, and convert the combined SAM stream back to BAM.
{
  cat newheader.sam
  samtools view toreheader.bam
} | samtools view -Sb - > reheadered.bam


Extract a subset

Extract sequences from a fasta file; samtools is indeed very handy

# Print the sequence named "2L" from dmel-short-masked.fasta.
# Further sequence or region names can presumably be appended as extra
# arguments - check the samtools faidx manual.
samtools faidx dmel-short-masked.fasta 2L

print the length of sequences in a fasta file
# Print "<name><TAB><length>" for every record of a FASTA file.
# Name and length are printed together by awk, so records with an empty
# sequence are reported with length 0 instead of shifting the name/length
# pairing, and an empty input file produces no output at all.
awk '
  /^>/ {
    if (seen) print name "\t" len    # flush the previous record
    name = substr($0, 2)             # header line without the leading ">"
    len = 0
    seen = 1
    next
  }
  { len += length($0) }              # sequence line: add up its length
  END { if (seen) print name "\t" len }
' my.fasta


split libraries for zipped fastq files

# Keep only the reads whose header line contains the barcode "#CGATGTAT/":
# join each 4-line FASTQ record into one tab-separated line, filter on the
# first column (the header), then restore the 4-line layout and re-compress.
gzip -cd testinput.fq.gz \
  | paste - - - - \
  | awk -F '\t' '$1 ~ /#CGATGTAT\//' \
  | tr '\t' '\n' \
  | gzip -c > testoutput.fq.gz

obtain reads with a given size range

e.g. reads with a length between 23 and 29nt

# Keep only FASTQ records whose sequence is 23-29 nt long (preview via head).
# Each 4-line record is joined into one tab-separated line; splitting on the
# tab makes column 2 the sequence no matter how many spaces the header holds.
paste - - - - < r5-16.nucs15-35.fastq \
  | awk -F '\t' 'length($2) >= 23 && length($2) <= 29' \
  | tr '\t' '\n' \
  | head

subsample from fastq files

Using the same seed (-s100) for both files ensures that the two subsampled files stay in sync (tool: seqtk).
# Draw 1045174 reads from each mate file with the identical seed (-s100).
for mate in 1 2; do
  seqtk sample -s100 "reads_${mate}.fastq" 1045174 > "reads-ss_${mate}.fastq"
done


replace in whole file