# FYI: If I say something like "move to Larry", Larry is the name of our Linux computer. #Running Trinity to assemble combined chipseq and genomeseq datasets selected for kmer coverage #Loaded and made Trinity (see https://github.com/trinityrnaseq/trinityrnaseq/wiki) #moved selected datasets into trinity folder #various scripts used for Trinity runs ./Trinity --seqType fq --left CombSeq_R1_trimmed_corrected_selected5to80.fq --right CombSeq_R2_trimmed_corrected_selected5to80.fq --CPU 48 --max_memory 100G ./Trinity --seqType fq --JM 800G --left allChipSeq_R1_trimmedcor.fastq,spurge_fil_1cor.fastq --right allChipSeq_R2_trimmedcor.fastq,spurge_fil_2cor.fastq --inchworm_cpu 32 --CPU 32 ./Trinity --seqType fq --left ~/data/raw_data/Allspurge_genomic_R1.fastq --right ~/data/raw_data/Allspurge_genomic_R1.fastq --single ~/data/raw_data/Allspurge_genomic_SE.fastq --inchworm_cpu 32 --CPU 32 --max_memory 100G #produced output Trinity_larry.fasta ./insilico_read_normalization.pl --seqType fq --JM 100G --max_cov 30 --left Patel_corrected_R1_selected5to80.fastq --right Patel_corrected_R2_selected5to80.fastq --pairs_together --PARALLEL_STATS --CPU 10 # pulling a long list of sequenced fragments from a fastq file with the custom script # upload parse.cpp (I moved file from CyVerse to my linux computer) #compile the program with the command below g++ parse.cpp # you should see an executable file called "a.out" #upload a list of sequence identifiers in the following format HWI-ST700677:123:C0MBEACXX:7:1101:10000:49547 HWI-ST700677:123:C0MBEACXX:7:1101:10000:71867 HWI-ST700677:123:C0MBEACXX:7:1101:10000:92366 HWI-ST700677:123:C0MBEACXX:7:1101:10000:97099 #run the rogram ./a.out uniq_sequence_ID_list.txt fastq_file.fastq output.fastq Here is how I processed the data. (Get the IDs of the reads that are mapped in a Linux machine using “cut -f 10 Unique_conserved_Nondorm1.txt >Unique_conserved_Nondorm1_ID” Here, Unique_conserved_Nondorm1.txt is a file you sent me showing the reads that are mapped. It is in a format like 100 0 0 0 0 0 0 0 + HWI-ST700677:123:C0MBEACXX:7:1101:10000:49547 100 0 100 DAM4scaffold 171785 62508 62608 1 100, 0, 62508, 75 0 0 0 0 0 0 0 + HWI-ST700677:123:C0MBEACXX:7:1101:10000:71867 100 25 100 CV03_9.8221.C1.Contig8590 819 558 633 1 75, 25, 558, 53 0 0 0 0 0 0 0 + HWI-ST700677:123:C0MBEACXX:7:1101:10000:92366 100 0 53 CV03_9.3473.C1.Contig3988 1075 268 321 1 53, 0, 268, 62 0 0 0 0 0 0 0 - HWI-ST700677:123:C0MBEACXX:7:1101:10000:97099 100 0 62 51274 3333 1455 1517 1 62, 38, 1455, 55 0 0 0 0 0 0 0 - HWI-ST700677:123:C0MBEACXX:7:1101:10001:14722 100 0 55 scaffold_26 561009 194490 194545 1 55, 45, 194490, Unique_conserved_Nondorm1_ID is the output file that contains the IDs of the mapped reads. It should be in a format looking like HWI-ST700677:123:C0MBEACXX:7:1101:10000:49547 HWI-ST700677:123:C0MBEACXX:7:1101:10000:71867 HWI-ST700677:123:C0MBEACXX:7:1101:10000:92366 HWI-ST700677:123:C0MBEACXX:7:1101:10000:97099 Next, Get the sequences of the mapped reads using the program attached to this email (“parse.cpp”). a. First you need to compile parse.cpp using command “g++ -c parse.exe parse.cpp” That will generate an executable program called “parse.exe” b. Then use this command to get the sequences “parse.exe Unique_conserved_Nondorm1_ID nondorm1_R1_trimmed nondorm1_R1_trimmed_mapped” Where Unique_conserved_Nondorm1_ID is the file generated in step 1, nondorm1_R1_trimmed is the original file that has all read sequences, and nondorm1_R1_trimmed_mapped is the output file that contains the sequences of the reads that have been mapped. #Running PriceTI wget -L "ftp://ftp.ncbi.nlm.nih.gov/blast/db/nr.00.tar.gz" wget -L "http://derisilab.ucsf.edu/software/price/PriceSource130506.tar.gz" cd ./PriceSource130506 make # add files using idrop # change names to fastq and fasta accordingly using file system # check file sizes using file system to make sure they match what should have been added # run program ./PriceTI -fpp ~/pathto/input_fastq_file_R1.fastq ~/pathto/input_fastq_file_R2.fastq 329 100 -icf contigs_mito.fa 1 1 5 -nc 30 -dbmax 105 -mol 25 -tol 20 -mpi 80 -target 100 2 1 1 -o output.fasta -a 16 #Using RepeatModeler and RepeatMasker. #step 1: BuildDatabase -name a1 -engine ncbi assembly.fa #step 2: RepeatModeler -database a1 -engine ncbi -pa 12 #step 3: RepeatMasker -lib a1-consensi.fa.classified assembly.fa -dir OP -pa 12 #In step 1 it first create a database from assembly. #Step 2 uses that database to create denovo repeat library from the assembly and #Step 3 finally mask repeats in the assembly using the denovo repeat library that obtained from step 2. #Load and install BBMAP wget -L "http://sourceforge.net/projects/bbmap/files/BBMap_35.82.tar.gz" #unzip file tar -xzf BBMap_35.82.tar.gz cd bbmap #install bbmap make #seqanswers website has a ton of info on the various uses for BBMAP program suite. Also, the creator of BBMAP is there and is very responsive. #Running and using iCommands #load icommands wget –L “http://www.iplantcollaborative.org/sites/default/files/irods/icommands.x86_64.tar.bz2” tar xvfj icommands.x86_64.tar.bz2 iinit data.iplantcollaborative.org iplant 1247 #upload files (The path below "/iplant/home/...." below is just an example- I copy and paste the path from the iplant datastore header) #for big files iget -VPf -N 0 -T -X my-file --lfrestart retry-file --retries 3 /iplant/home/horvathdp/Spurge_HiSeq2000/kmer_seperated_trimmed_genomic_data/BBNORM_OUT/filename.fastq.gz #or iput -VPf -N 0 -T -X my-file --lfrestart retry-file --retries 3 big_file_of_interest.fastq /iplant/home/horvathdp/Spurge_HiSeq2000/kmer_seperated_trimmed_genomic_data/BBNORM_OUT/ #for small files iget /iplant/home/horvathdp/Spurge_HiSeq2000/kmer_seperated_trimmed_genomic_data/BBNORM_OUT/small_gene_list #cleaning and correcting reads using BBMAP Cleaning and correcting #find adapters ./bbmerge.sh in1=~/data/ Allspurge_genomic_R1.fastq in2=~/data/ Allspurge_genomic_R2.fastq outa=adapters.fa #trim adapters ./bbduk.sh -Xmx1g in1=~/data/ Allspurge_genomic_R1.fastq in2=~/data/ Allspurge_genomic_R2.fastq out1=clean1.fq out2=clean2.fq ktrim=r ref=truseq_rna.fa.gz k=28 mink=12 hdist=1 ref=resources/truseq.fa.gz # if you did this correctly, you should see something like: Input is being processed as paired Started output streams: 0.036 seconds. Processing time: 14790.990 seconds. Input: 1164536920 reads 174680538000 bases. KTrimmed: 263807 reads (0.02%) 31197557 bases (0.02%) Result: 1164458998 reads (99.99%) 174649340443 bases (99.98%) Time: 14791.179 seconds. Reads Processed: 1164m 78.73k reads/sec Bases Processed: 174680m 11.81m bases/sec #check adapters again ./bbmerge.sh in1=clean1b.fq in2=clean2b.fq outa=adapters3.fa #remove any additional sequences such as 4 at the beginning ./bbduk.sh in=clean1.fq out=clean1a.fq ftl=4 #or if more complex ./bbduk.sh -Xmx1g in1=clean1.fq in2=clean2.fq out1=clean1a.fq out2=clean2a.fq ktrim=r literal=AGATCGGAAGA k=11 mm=f hdist=2 #quality trim ./bbduk.sh in=cleanR1.fq out=cleantrimmedR1.fq qtrim=r trimq=20 ./bbduk.sh in= cleanR2.fq out=cleantrimmedR2.fq qtrim=l trimq=20 # filter by length ./bbduk.sh in= cleantrimmedR1.fq out=cleantrimmedfilteredR1.fq minlen=70 ./bbduk.sh in= cleantrimmedR2.fq out= cleantrimmedfilteredR2.fq minlen=70 #repair to remove singletons caused by loss of very short reads ./repair.sh in1= cleantrimmedfilteredR1.fq in2= cleantrimmedfilteredR2.fq out1=final_R1.fq out2=final_R2.fq outs=Joelle_singles.fq #error correct ./bbnorm.sh in1=final_R1.fq in2=final_R2.fq out1=final_R1_corrected.fq out2=final_R2_corrected.fq ecc=t keepall passes=1 bits=16 prefilter #report coverage ./bbnorm.sh in1=final_R1_corrected.fq in2=final_R2_corrected.fq khist=khist.txt peaks=peaks.txt passes=1 prefilter minprob=0 minqual=0 mindepth=0 #select max kmer at 20 ./bbnorm.sh in1=final_R1_corrected.fq in2=final_R2_corrected.fq out1=final_R1_5to30.fq out2=final_R2_5to30.fq target=30 min=5 # Assembly of conserved and expressed gene space #isolate conserved, expressed, and low copy portion of the genome: #1) in cyverse used bowtie2 to ID Patel corrected set fragments with hits to the transcriptome and EST set (file RNAseqALL_and_sangerESTs.fasta). #2) in cyverse used bowtie2 to ID Patel corrected set fragments with hits to manihot genome in cyverse (file Manihot esculenta [Cassava] Manihot_esculenta_v6). #3) in cyverse used bowtie2 to ID Patel corrected set fragments with hits to ricinus genome in cyverse (file Rcommunis_119.fa) #4) in cyverse used bowtie2 to ID Patel corrected set fragments with hits to Hevea genome in cyverse (Ptrichocarpa_210.fa) #4) select and build fastq files from these hits #5) combined output from bowtie sam file comrpessed and moved to linux box #6) select hits where both frags were properly paired (those with a score of 83 or 99 in the second column) awk '$2 == "83" { print $0 }' d_hits.txt >output_dhit83file.txt awk '$2 == "99" { print $0 }' d_hits.txt >output_dhit99file.txt #7) Concatenated these two files cat output_dhit99file.txt output_dhit83file.txt >cat_dhits.txt #8) select first column cut -f1 cat_dhits.txt >cut_dhits.out #9) sort file sort cut_dhits.out >sort_dhits.out #10) generate list of unique identifiers uniq sort_dhits.out >list_of_conserved_and_expressed_hits.out #11) build fastq files using ccparse upload parse.cpp wget -L "https://github.com/mycstar/parse-map.git" g++ parse.cpp ./a.out ~/bbmap/list_of_conserved_and_expressed_hits.out ~/bbmap/Patel_corrected_chipandgenomeR1.fastq ~/bbmap/conserved_and_expressed_hits_R1.out ./a.out ~/bbmap/list_of_conserved_and_expressed_hits.out ~/bbmap/Patel_corrected_chipandgenomeR2.fastq ~/bbmap/conserved_and_expressed_hits_R2.out #repair files using bbmap ./repair.sh in1=conserved_and_expressed_hits_R1.out in2=conserved_and_expressed_hits_R2.out out1=repaired_conserved_and_expressed_hits_R1.fastq out2=repaired_conserved_and_expressed_hits_R2.fastq # 12) used bbmap to isolate those kmers from the combined files with coverage between 5X and 80X (based on the kmer coverage distribution curve generated by bbmap) ./bbnorm.sh in1=repaired_conserved_and_expressed_hits_R1.fastq in2=repaired_conserved_and_expressed_hits_R2.fastq out1=Patel_conserved_and_expressed_hits_R1_selected5to80.fastq out2=Patel_conserved_and_expressed_hits_R2_selected5to80.fastq ecc=t target=80 min=5 passes=1 prefilter=t cbits=16 minprob=0.8 # 13) Select all lowcopy sequences from the combined genomic libraries ./bbnorm.sh in1=Patel_R1.fastq in2=Patel_R2.fastq out1=1Patel_corrected_R1_selected5to80.fastq out2=1Patel_corrected_R2_selected5to80.fastq ecc=t target=80 min=5 passes=1 prefilter=t cbits=16 minprob=0.8 #14) combined with lowcopy kmers from step 1 cat 1Patel_corrected_R1_selected5to80.fastq ~/bbmap/Patel_conserved_and_expressed_hits_R1_selected5to80.fastq >PatelSelectedCombined5to80_R1.fastq cat 1Patel_corrected_R2_selected5to80.fastq ~/bbmap/Patel_conserved_and_expressed_hits_R2_selected5to80.fastq >PatelSelectedCombined5to80_R2.fastq grep 'HWI' PatelSelectedCombined5to80_R1.fastq > listSelectedCombined.txt sort listSelectedCombined.txt > listSelectedCombinedsorted.txt uniq listSelectedCombinedsorted.txt > listSelectedCombined.uniq #15) reformat the file for rebuilding fastq files using parse.ccp (note- the combined lists added about 10G more frags to low-copy library) sed 's/\s.*$//' listSelectedCombined.uniq > finallist.txt sed 's/ //' file finallist.txt1 sed 's/@//' file finallist.txt2 #16) reassemble and repair low copy, expressed, and conserved fastq files ./a.out finallist.txt ~/bbmap/Patel_corrected_chipandgenomeR1.fastq ~/data/5to80_both_selected_and_conserved_and_expressed_hits_R1.out ./a.out ~/data/listSelectedCombined.uniq ~/bbmap/Patel_corrected_chipandgenomeR2.fastq ~/data/5to80_both_selected_and_conserved_and_expressed_hits_R2.out ./repair.sh in1=~/data/5to80_both_selected_and_conserved_and_expressed_hits_R1.out in2=~/data/5to80_both_selected_and_conserved_and_expressed_hits_R2.out out1=~/data/5to80_both_selected_and_conserved_and_expressed_hits_R1.fastq out2=~/data/5to80_both_selected_and_conserved_and_expressed_hits_R2.fastq outsingle+singles.fastq #Alt.method #load related genomes from genbank: for example- GCA_001659605.1_Manihot_esculenta_v6 #1) use bbmap to identify conserved and expressed sequences ./bbmap.sh in=~/colormap/Patel_corrected_chipandgenome_merged.fastq ref=~/data/RNAseqALL_and_sangerESTs.fasta nodisk outm=expressed_matched.fq outu=not_expressed.fq maxindel=3 minid=.95 ./bbmap.sh in=~/colormap/Patel_corrected_chipandgenome_merged.fastq ref=~/data/Ricinus_genomic.fna nodisk outm=Ricinus_matched.fq outu=not_Ricinus.fq maxindel=20 minid=0.8 ./bbmap.sh in=~/colormap/Patel_corrected_chipandgenome_merged.fastq ref=~/data/Manihot_genomic.fna nodisk outm=Manihot_matched.fq outu=not_Manihot.fq maxindel=20 minid=0.8 ./bbmap.sh in=~/colormap/Patel_corrected_chipandgenome_merged.fastq ref=~/data/Hevea_genomic.fna nodisk outm=Hevea_matched.fq outu=not_Hevea.fq maxindel=20 minid=0.8 ./bbmap.sh in=~/colormap/Patel_corrected_chipandgenome_merged.fastq ref=~/data/Poplar_genomic.fna nodisk outm=Poplar_matched.fq outu=not_Poplar.fq maxindel=20 minid=0.8 #2) Select all lowcopy sequences from the combined genomic libraries ./bbnorm.sh in=Patel_merged.fastq out=Patel_merged_5to80.fastq ecc=t target=80 min=5 passes=1 prefilter=t cbits=16 minprob=0.8 #3) concatenate and remove duplicates for expressed and conserved files cat expressed_matched.fq Hevea_matched.fq Manihot_matched.fq Ricinus_matched.fq Patel_merged_5to80.fastq >bbmap_expressed_and_conserved.fastq #3)depuplicate with anna's program - https://github.com/beedatadriven/FASTQ_dedup perl output_uniq_sequences_based_on_id.pl -i bbmap_expressed_and_conserved.fastq -o ~/data/uniq_expressed_and_conserved.fastq ./bbnorm.sh in=~/data/uniq_expressed_and_conserved2.fastq out=uniq_expressed_and_conserved_selected5to80.fastq ecc=t target=80 min=5 passes=1 prefilter=t cbits=16 minprob=0.8 cat uniq_expressed_and_conserved_selected5to80.fastq Patel_corrected_selected5to80_merged.fastq >5to80_expressed_and_conserved_selected_alt.fastq perl output_uniq_sequences_based_on_id.pl -i 5to80_expressed_and_conserved_selected_alt.fastq -o 5to80_expressed_and_conserved_selected_alt_uniq.fastq #7) used bbmap to limit coverage to 20X max to generate the dataset for assembly ./bbnorm.sh in=~/data/5to80_expressed_and_conserved_selected_alt_uniq.fastq out=Patelselected_expressed_conserevd20norm1_alt.fastq target=20 #Assemble data in larry with trinity, abyss, and Tadpole and in cyverse with Trinity and velvet #Tadpole ./reformat.sh in1=Patelselected_expressed_conserevd20norm1.fq in2=Patelselected_expressed_conserevd20norm2.fq out=5to80combined_hits_interleaved.fastq ./tadpole.sh in=Patelselected_expressed_conserevd20norm1_alt.fastq out=contigs_5to80combined63min3.fasta k=63 mincov=3 #alt: ./tadpole.sh in=Patelselected_expressed_conserevd20norm1_alt.fastq out=contigs_5to80combined63min3alt.fasta k=63 mincov=3 #size select bioawk -c fastx '{ if(length($seq) > 200) { print ">"$name; print $seq }}' contigs_5to80combined63min3alt.fasta >contigs_5to80combined63min3alt_200plus.fasta #remove commas from header sed 's/,//g' contigs_5to80combined63min3alt_200plus.fasta >~/bbmap/contigs_5to80combined63min3alt_200plusmod.fasta #Abyss abyss-pe name=5to80combined k=64 in='~/bbmap/Patel5to80combined20norm/Patelselected_expressed_conserevd20norm1.fq ~/bbmap/Patel5to80combined20norm/Patelselected_expressed_conserevd20norm2.fq' #Trinity-larry ./Trinity --seqType fq --left ~/bbmap/Patel5to80combined20norm/Patelselected_expressed_conserevd20norm1.fq --right ~/bbmap/Patel5to80combined20norm/Patelselected_expressed_conserevd20norm1.fq --inchworm_cpu 32 --CPU 32 --max memory 150G # download L_RNA_scaffolder # here is a link to the paper, but last time I checked, the download site was broken. https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-14-604 # The author sent me a copy of the program file which I will include along with the custom script (see parse.cc above) for pulling out selected fastq sequences. email me (david.horvath@ars.usda.gov) and I will send you the files. #Using L_RNA_scaffolder Required: -d : the directory where the programs are in. -i : the output of transcripts alignment with BLAT. SeePSL format -j : the genome fragments fasta file which will be scaffolded and was used as the database when aligning transcript reads. Optional: -r : some fragments which you might be interesting and will not be scaffolded. The file has two columns per row. One row stand for that two fragments might be connected and should not be scaffolded. -l : the threshold of alignment length coverage (default:0.95). If one read has a hit of which length coverage was over the threshold, then this read would be filtered out. -p : the threshold of alignment identity (default: 0.9). If one alignment has an identity over the threshold, then the alignment is kept for the further analysis. -o : the directory where the output file is stored. The default output directory is equal to the program directory. -e : The maximal intron length between two exons (default: 100kb). -f : the minimal number of supporting reads (default: 1). If the number of the supporting reads for the connection is over the frequency, then this connection is reliable. #First step is to use BLAT(with options) in CyVerse to generate a psl file by blat'ing the transcripome assembly as the query file and the genome assembly as the reference. Then zipping the file and moving it to your linux box and unzipping it there. #then run the scaffolding program ./L_RNA_scaffolder.sh -d ./ -i Trinity5to80combined_as_ref_vs_velvet_assembly_as_query.psl -j TrinityLarry_5to80combined63min3.fasta #this will produce a number of output files. If I remember correctly, the one L_RNA_scaffolder.fasta is the one you want. #Identifying high copy sequences likley including most repetative elements ./bbnorm.sh in=Patel_merged.fastq out=Patel_merged_80plus.fastq ecc=t target=99999 min=80 passes=1 prefilter=t cbits=16 minprob=0.8 #Assembling Chloroplast and Mitochondrial Genomes #in iplant, use Bowtie2 to map fragments to a list of plastid genomes from related species: Chloroplasts: initial spurge assembly, hevea, castor, cassava, and jatropha Mitochondria: castor, hevea, Ziziphus, Gossypium, populus Move file to Larry #select hits where both frags were properly paired (those with a score of 83 or 99 in the second column) awk '$2 == "83" { print $0 }' mito_concatenate_out.txt >output_mito83file.txt awk '$2 == "99" { print $0 }' mito_concatenate_out.txt >output_mito99file.txt #Concatenated these two files cat output_mito83file.txt output_mito99file.txt >cat_file.out #select first column cut -f1 cat_file.out >cut_file.out #sort file sort cut_file.out >sort_file.out #generate list of unique identifiers uniq sort_file.out >list_of_hits.out #build fastq files using ccparse upload parse.cpp g++ parse.cpp ./a.out ~/bbmap/list_uniq_mito_hits.txt ~/bbmap/Patel_corrected_chipandgenomeR1.fastq ~/bbmap/Patel_mapped_mito_hits_R1.fastq ./a.out ~/bbmap/list_uniq_mito_hits.txt ~/bbmap/Patel_corrected_chipandgenomeR2.fastq ~/bbmap/Patel_mapped_mito_hits_R2.fastq #repair files using bbmap ./repair.sh in1=Patel_mapped_mito_hits_R1a.fastq in2=Patel_mapped_mito_hits_R2a.fastq out1=repaired_Patel_mapped_mito_hits_R1.fastq out2=repaired_Patel_mapped_mito_hits_R2.fastq #interleave files ./reformat.sh in1=repaired_Patel_mapped_mito_hits_R1a.fastq in2=repaired_Patel_mapped_mito_hits_R2a.fastq out=mito_hits_interleaved.fastq #assemble with tadpole ./tadpole.sh in=interleaved_mito_hits out=contigs_mito.fa #fill gaps with PriceTI ./PriceTI -fpp Patel_corrected_chipandgenomeR1.fastq Patel_corrected_chipandgenomeR2.fastq 329 100 -icf contigs_mito.fa 1 1 5 -nc 30 -dbmax 150 -mol 55 -tol 20 -mpi 80 -target 100 2 1 1 -o mito_ext_final.fasta -a 16 #This worked great for the chloroplast, but only gave 10 contigs (128K total) with significant homology to mitochondria from euphorbs and the assembly was heavily contaminated with chloroplast sequences. #for motochondria, remove chloroplast sequences prior to running PiceTI #creat file that lacks cloroplast sequences since they can mess up a mitochondrial assembly ./bbduk.sh in=Patel_corrected_chipandgenome_merged.fastq outm=chloro.fq outu=no_chloro_interleaved.fastq ref=Chloroplasts_from_spurge_corrected_oriented.fasta k=31 ./bbmap.sh in=cno_chloro_interleaved.fastq ref=mitochondria_various_species.fasta nodisk outm=mito_matched.fq outu=no_mito.fq maxindel=20 minid=0.7 ./tadpole.sh in=mito_matched.fq out=non_chloro_mito_contigs.fasta # map the resulting contigs to euphorb mitochondria with blast and remove poorly matching contigs. # deinterleave the chloroplast depleted illumina sequences ./reformat in=no_chloro.fq out1=no_chloro_R1.fastq out2=no_chloro_R2.fastq ./PriceTI -fpp ~/data/no_chloro_R1.fastq ~/data/no_chloro_R2.fastq 329 100 -icf ~/data/mito_top_longest_list.fasta 1 1 5 -nc 30 -dbmax 150 -mol 55 -tol 20 -mpi 80 -target 100 2 1 1 -o ~/data/mito_longest_ext.fasta -a 48 #SNP Analysis #load and install samtools #go to http://www.htslib.org/download/ wget -L "https://github.com/samtools/samtools/releases/download/1.6/samtools-1.6.tar.bz2" tar -xvjf https://github.com/samtools/samtools/releases/download/1.6/samtools-1.6.tar.bz2 cd samtools-1.6 ./configure --prefix=/where/to/install #(ie. /usr/local/bin/) make sudo make install #repeat for bcftools-1.6 and htslib-1.6 #You may need to add some of these programs to your path. To do so type for example: which bcftools /usr/local/bin/bcftools #Type the following and hit enter: export PATH=/usr/local/bin/bcftools:$PATH # you can check for success just by typing: bcftools #that should give you info on the program that would be called bt that command #duplicate-marked BAM files were moved from CyVerse to our in-house linux system (Larry) # these were run through samtools mpileup and then through bcftools to generate a vcf file with the following commands: #index the reference file samtools faidx Patel_Trinity_L_RNA_scaffolded.fasta #generate the bcf file samtools mpileup -g -O -s -t 'DP,DPR' -f Patel_Trinity_L_RNA_scaffolded.fasta -b dorm1_marked_duplicates.bam dorm2_marked_duplicates.bam nondorm1_marked_duplicates.bam nondorm2_marked_duplicates.bam > myraw_all2.bcf #note, newer versions of samtools does not use the same options as the above (which was run on samtools 1.3.1) #call variants bcftools call -c myraw_all2.bcf > myvar_all.bcf #filter and write vcf bcftools view myvar_all.bcf |/usr/share/samtools/vcfutils.pl varFilter -Q 25 -d 10 -D 10000 > myvarfinal_all.vcf #write SNP stats vcf-stats myvarfinal_all.vcf > stats.txt grep 'snp_count' stats.txt > snp_count.txt awk '{sum+=$1} END {print sum}' snp_count.txt #filtering with vcftools vcftools --gzvcf ~/data/spurge_bams/dup_marked_bams/myvarfinal_all.vcf.gz --max-missing 0.5 --mac 3 --minQ 30 --recode --recode-INFO-all --out rawfilterd1 vcftools --vcf raw.filtered1.recode.vcf --minDP 3 --recode --recode-INFO-all --out raw.filtered2 #perl script to pull selected sequences from a fasta file. perl -e ' ($id,$fasta)=@ARGV; open(ID,$id); while () { s/\r?\n//; /^>?(\S+)/; $ids{$1}++; } $num_ids = keys %ids; open(F, $fasta); $s_read = $s_wrote = $print_it = 0; while () { if (/^>(\S+)/) { $s_read++; if ($ids{$1}) { $s_wrote++; $print_it = 1; delete $ids{$1} } else { $print_it = 0 } }; if ($print_it) { print $_ } }; END { warn "Searched $s_read FASTA records.\nFound $s_wrote IDs out of $num_ids in the ID list.\n" } ' identifier_list.txt target_file.fasta > output.fasta