# ---------------------------------------------------------------------------
# Appendix A. Perl script to extract sequences matching a substring pattern
# ---------------------------------------------------------------------------
# \Bioinformatics\Perl\My_BioEx\Extract_str.pl                  Royal Truman
#
# Usage:     perl Extract_str.pl
# Intention: Extract into a file the portions of sequences which match the
#            desired patterns. These are defined in sub Str_Found.
#            $Stem defines a file identifier in common for all input and
#            output files.
# Version:   Aug 31, 2004  First programmed.
#
# Note: each appendix is a stand-alone script; save it to its own file.
# Input records are fasta entries whose header line starts with $New_Seq.
# The last record is processed at end of file, so no trailing marker line
# is required (a trailing bare marker line is tolerated and ignored).

use strict;
use warnings;

my $Stem          = "seqs";
my $File_Source   = $Stem . ".fasta";
my $File_Out      = $Stem . "_out.fasta";
my $File_Bad_Seqs = $Stem . "_bad_Seqs.fasta";
my $New_Seq       = ">gi";      # Identifies the header line of a new fasta sequence

my $Annotation;                 # Header line of the record currently being read
my $Seq = "";                   # Sequence lines collected for that record
my $Seq_Substr;                 # Most recent substring matched by Str_Found

open(my $In,      '<', $File_Source)   or die "Can't open $File_Source: $!";
open(my $Out,     '>', $File_Out)      or die "Can't open $File_Out: $!";
open(my $Out_Bad, '>', $File_Bad_Seqs) or die "Can't open $File_Bad_Seqs: $!";

print {$Out_Bad} "Sequences lacking the searched pattern\n\n";

while (my $Line = <$In>) {                  # Read the input one line at a time
    if ($Line =~ /^\Q$New_Seq\E/) {         # Start of a new sequence in the input file
        # The previous record is complete only now. Nothing has been read
        # yet when the very first header is seen, hence the defined() guard.
        Store_Record() if defined $Annotation;
        $Annotation = $Line;
        $Seq        = "";
    }
    else {
        $Seq .= $Line;
    }
}

# Process the last record, which is not followed by another header line.
# A bare marker line appended to the file by hand is not a real record.
Store_Record()
    if defined $Annotation and $Annotation !~ /^\Q$New_Seq\E\s*$/;

#print {$Out} "$New_Seq";                   # Facilitate further processing

close($In);
close($Out)     or die "Can't close $File_Out: $!";
close($Out_Bad) or die "Can't close $File_Bad_Seqs: $!";

print STDERR "\n\nFasta sequences taken from : $File_Source\n";
print STDERR "Cleaned up data stored in  : $File_Out\n";
print STDERR "Non-hit sequences stored in: $File_Bad_Seqs \n\n";

# Store_Record: file the record held in $Annotation / $Seq.
# Str_Found writes hits to the output file itself; records without a hit
# are copied unchanged to the bad-sequences file.
sub Store_Record {
    return if Str_Found($Seq);
    print {$Out_Bad} $Annotation;
    print {$Out_Bad} $Seq;
    return;
}

# Str_Found: search one sequence for the wanted pattern.
#   Parameter: $Str - the sequence text, possibly spread over several lines.
#   Returns:   1 if the pattern was found, 0 otherwise.
#   On a hit the current annotation and the matched substring are written
#   to the output file and the substring is kept in $Seq_Substr.
# Several patterns can be supported by adding further
#   if (...) { ...; return 1; }
# tests before the final return.
sub Str_Found {
    my ($Str) = @_;             # @_ passes 1 parameter into the () list
    return 0 unless defined $Str;

    # Line breaks are layout, not residues: remove them so that they can
    # neither split the motif nor be counted by the {66,70} quantifier.
    (my $Residues = $Str) =~ s/\s+//g;

    if ($Residues =~ /(M[QH]IF.K.{66,70}?RGG)/i) {     # /i: case insensitive
        $Seq_Substr = $1;                              # The substring that matched
        print {$Out} $Annotation;
        print {$Out} "$Seq_Substr\n\n";
        return 1;
    }
    return 0;
}

# ---------------------------------------------------------------------------
# Appendix B. Perl script to remove sequences based on annotations
# ---------------------------------------------------------------------------
# \Bioinformatics\Perl\My_BioEx\Usable_Annot.pl                 Royal Truman
#
# Usage:     perl Usable_Annot.pl
# Intention: Remove sequences whose annotation shows problems, as defined
#            in subroutine Bad_Annotation.
# Notes:     The last record is processed at end of file, so the marker
#            $New_Seq (usually ">gi") no longer has to be added to the end
#            of the $File_Source file; a trailing bare marker is ignored.
#            Acceptable sequences in <$Stem>_bad_Seqs.fasta can be manually
#            transferred to <$Stem>_out.fasta.
#            $Stem defines a file identifier in common for all input and
#            output files.
# Version:   Aug 5, 2004

use strict;
use warnings;

my $Stem          = "ex";
my $File_Source   = $Stem . ".fasta";
my $File_Out      = $Stem . "_out.fasta";
my $File_Bad_Seqs = $Stem . "_bad_Seqs.fasta";
my $File_Virus    = $Stem . "_virus.fasta";
my $File_NoOrg    = $Stem . "_no_org.fasta";
my $New_Seq       = ">gi";      # String which identifies new sequences

my $Annotation;                 # Header line of the record currently being read
my $Seq = "";                   # Sequence lines collected for that record

open(my $In,      '<', $File_Source)   or die "Can't open $File_Source: $!";
open(my $Out,     '>', $File_Out)      or die "Can't open $File_Out: $!";
open(my $Out_Bad, '>', $File_Bad_Seqs) or die "Can't open $File_Bad_Seqs: $!";
print {$Out_Bad} "Apparently these sequences cannot be used\n\n";

# The two annotation logs are opened in append mode, so they accumulate
# over successive runs.
open(my $Log_Virus, '>>', $File_Virus) or die "Can't open $File_Virus: $!";
print {$Log_Virus} "Apparently viral organisms\n\n";
open(my $Log_NoOrg, '>>', $File_NoOrg) or die "Can't open $File_NoOrg: $!";
print {$Log_NoOrg} "Organisms apparently missing in the Annotation\n\n";

while (my $Line = <$In>) {                  # Read the input one line at a time
    if ($Line =~ /^\Q$New_Seq\E/) {         # Start of a new sequence in the input file
        # Nothing has been read yet when the very first header is seen.
        Route_Record() if defined $Annotation;
        $Annotation = $Line;
        $Seq        = "";
    }
    else {
        $Seq .= $Line;
    }
}

# Process the last record, which is not followed by another header line.
# A bare marker line at the end of the input is not a real record.
Route_Record()
    if defined $Annotation and $Annotation !~ /^\Q$New_Seq\E\s*$/;

print {$Out} "$New_Seq";                    # Facilitate further processing

close($In);
close($Out)       or die "Can't close $File_Out: $!";
close($Out_Bad)   or die "Can't close $File_Bad_Seqs: $!";
close($Log_Virus) or die "Can't close $File_Virus: $!";
close($Log_NoOrg) or die "Can't close $File_NoOrg: $!";

print STDERR "\nFasta sequences taken from: $File_Source\n";
print STDERR "\nCleaned up data stored in: $File_Out\n";
print STDERR "\nBad annotations stored in: $File_Virus and $File_NoOrg \n";
print STDERR "\nBad annotation and sequences stored in: $File_Bad_Seqs \n";

# Route_Record: write the record held in $Annotation / $Seq either to the
# bad-sequences file or to the cleaned-up output file.
sub Route_Record {
    my $Target = Bad_Annotation($Annotation) ? $Out_Bad : $Out;
    print {$Target} $Annotation;
    print {$Target} $Seq;
    return;
}

# Bad_Annotation: suspect annotations are identified and stored in
# special files.
#   Parameter: $Str - the annotation (header) line of a record.
#   Returns:   1 if the sequence probably can't be used, 0 otherwise.
#   An annotation containing "virus" is logged to the virus file; one
#   without a "[" is logged to the no-organism file.
sub Bad_Annotation {
    my ($Str) = @_;             # @_ passes 1 parameter into the () list
    $Str = "" unless defined $Str;

    if ($Str =~ /virus/i) {     # Exclude virus; /i: case insensitive
        print {$Log_Virus} $Str;
        return 1;               # The sequence probably can't be used
    }
    if ($Str !~ /\[/) {         # No "[" anywhere in the annotation
        print {$Log_NoOrg} $Str;
        return 1;               # The sequence probably can't be used
    }
    return 0;
}

# Note:long is the ratio of [5] / [4]: 4.3X10-83