#!/usr/local/bin/perl -w #This is the commandline version of the cDNA Quality Control Tool, cQC. #The tool is available as a web utility at: http://genomics.arizona.edu/software/cQC. #Comments included in the code below are intended to help make sense of implementation # details for specific stretches of code. They do not provide a high-level overview of # functionality. For a high-level overview, please read the paper, found at # http://genomics.arizona.edu/software/cQC. use Bio::SearchIO; use Getopt::Long; use strict; my $testing = 0; my $sim4_bin = "/usr/local/bin/sim4"; my $fasta_bin = "/usr/local/bin/fastacmd"; my $cdna_seq_file = "temp_cdnaseq.txt"; my $genomic_seq_file = "temp_genomicseq.txt"; my $database = ""; my $files_dir = ""; my $blast_result_file = "blast_result.bOUT"; my $blast_e_val = "1e-50"; my $one_hit_file = "one_hit.out"; my $many_hits_file = "chimeric_align.out"; #chimeric alignments my $many_hits_seqs_file = "chimeric_seqs.out"; #chimeric seqs in web cQC my $no_hits_file = "no_hits.out"; #no genomic counterpart my $no_hits_seqs_file = "no_gen_counterpart_seqs.out"; #cDNAs with no genomic counterpart my $too_short_file = "lack_end_sim.out"; #lacks end similarity my $too_short_seqs_file = "lack_end_sim_seqs.out"; my $too_short_snippets_file = "lack_end_sim_snippets.out"; #end sequence that doesn't align my $altered_seqs_file = "altered_seqs.out"; my $altered_seqs_align = "altered_align.out"; #sim4 alignment of cDNAs to genomic that don't align perfectly. my $mystery_align_file = "lack_int_sim_align.out"; #lack internal similarity my $mystery_seqs_file = "lack_int_sim_seqs.out"; my $rdna_seqs_file = "rdna_seqs.out"; #rDNA-containing cDNAs my $is_matches_file = "insertion_seq_matches.out"; #insertion sequence-containing cDNAs my $is_db = "/home2/websites/igert/software/cQC/files/BLAST_IS_DB/IS_dbIII"; my $cdna_file = ""; my $good_hit_pct = 95; my $flank_length = 2000; my $cluster_proximity = 10000; # the default for Arabadopsis my $flanking_ext_increase = 5000; my $begin_discrep=1; my $end_discrep=0; my $one_hit_altered_fasta_line = "cDNA sequence altered"; my $one_hit_perfect_fasta_line = "perfect alignment to genomic"; my $no_hit_fasta_line = "no genomic counterpart"; my $non_homol_term_fasta_line = "lacks end similarity"; my $non_homol_intern_fasta_line = "lacks internal similarity"; my $many_hit_fasta_line = "chimeric sequence"; my ($orf_nt_num, $utr5_nt_num, $utr3_nt_num) = (0,0,0); my $sim4_A = 3; my $sim4_K = 20; my $rrna_db = ""; #my $blast_bin = "/usr/local/ncbi/bin/blastall -p blastn"; my $blast_bin = "/usr/local/blast/megablast -D 2"; my $big_number = 1000000000; eval { #The rest of the code is in an eval context to assist with recovering from errors in the webscript. GetOptions( "4|sim4:s" => \$sim4_bin, "a|altered:s" => \$altered_seqs_file, "b|blast:s" => \$blast_bin, "c|cdna:s" => \$cdna_file, "d|db:s" => \$database, #genome database "e|eval:s" => \$blast_e_val, "f|fasta:s" => \$fasta_bin, "g|hitpct:i" => \$good_hit_pct, "h|nohitseqs:s" => \$no_hits_seqs_file, "i|isdb:s" => \$is_db, "j|alteredalign:s" => \$altered_seqs_align, "k|filesdir:s" => \$files_dir, "l|flank:i" => \$flank_length, "m|chimeric:s" => \$many_hits_file, "n|nohits:s" => \$no_hits_file, "o|onehit:s" => \$one_hit_file, "r|rrnadb:s" => \$rrna_db, "s|nogenseqs:s" => \$too_short_seqs_file, "t|nogen:s" => \$too_short_file, "u|nogensnip:s" => \$too_short_snippets_file, "v:i" => \$begin_discrep, "w:i" => \$end_discrep, "x|clust_prox:i" => \$cluster_proximity, "y|lackintseqs:s" => \$mystery_seqs_file, "z|lackint:s" => \$mystery_align_file, "chimeric_seq:s" => \$many_hits_seqs_file, "test:i" => \$testing, ); $files_dir .= "/" if $files_dir; open (MANYHITS, ">$files_dir$many_hits_file"); open (MANYHITS_SEQS, ">$files_dir$many_hits_seqs_file"); open (NOHITS, ">$files_dir$no_hits_file"); open (NOHITS_SEQS, ">$files_dir$no_hits_seqs_file"); open (FASTA_ERRORS, ">$files_dir"."fasta_errors"); open (IS_MATCHES, ">$files_dir$is_matches_file"); open (RDNA_SEQS, ">$files_dir$rdna_seqs_file"); open (ONEHIT, ">$files_dir$one_hit_file"); open (TOOSHORT, ">$files_dir$too_short_file"); open (TOOSHORTSEQS, ">$files_dir$too_short_seqs_file"); open (TOOSHORT_SNIPPETS, ">$files_dir$too_short_snippets_file"); open (ALTERED, ">$files_dir$altered_seqs_file"); open (ALTERED_ALIGN, ">$files_dir$altered_seqs_align"); #print header to file...may be the only thing that gets printed if no seqs are altered print ALTERED join("\t" , qw( qid utr5_inserted_bases utr5_deleted_bases utr5_insertion_gaps utr5_deletion_gaps utr5_substitutions orf_inserted_bases orf_deleted_bases orf_insertion_gaps orf_deletion_gaps orf_substitutions utr3_inserted_bases utr3_deleted_bases utr3_insertion_gaps utr3_deletion_gaps utr3_substitutions frame_shifted stop_added) ). "\n"; open (MYSTERY, ">$files_dir$mystery_align_file");#put all sequences (run through sim4)that don't match entire cDNA internally into this file open (MYSTERY_SEQ, ">$files_dir$mystery_seqs_file");#put all sequences (run through sim4)that don't match entire cDNA internally into this file if ($testing) { #print data out to files right away so you can follow them realtime $| = 1; #print right away (no buffer) ONEHIT->autoflush(1); TOOSHORT->autoflush(1); TOOSHORTSEQS->autoflush(1); TOOSHORT_SNIPPETS->autoflush(1); ALTERED->autoflush(1); ALTERED_ALIGN->autoflush(1); MYSTERY->autoflush(1); MYSTERY_SEQ->autoflush(1); } #capture the list of all cdna names and their corresponding sequence my $old_splitter = $/; $/= ">"; open (CDNA, "<$cdna_file"); my @cdnas = ; shift @cdnas; #remove the first one, cause it's empty my %cdna_seqs; my %untouched_cdna_seqs; my %orig_cdna_order; my @not_hit_cdnas; my $i=0; #get pairs of id/sequence foreach (@cdnas) {#for each sequence (fasta format) #capture first line into $id, the rest into $seq (ignoring the ">" at the end) #my ($id, $seq) = ( $_ =~ m/(\S+?)(?:\n|(?:(?: |\t).*?\n))(.*?)>?/s ) ; my ($id, $seq) = ( $_ =~ m/(\S+?)(?:(?:\015|\012)+|(?:(?: |\t).*?(?:\015|\012)+))(.*?)(>|$)/s ) ; #remove whitespace (includes newlines, carriage returns, etc.) $seq =~ s/\s//sg; #remove poly-A tail $seq .= "A"; #this is put here so the regex below will match something. It'll be removed by that regex $seq =~ s/(a+.{0,3}a*)$//gi; #replace polyA with nothing (can have up to three bps that are not A's) #remove poly-Ns at the end of the sequence $seq .= "N"; #this is put here so the regex below will match something. It'll be removed by that regex $seq =~ s/(n+.{0,3}n*)$//gi; #replace polyN with nothing (can have up to three bps that are not N's) $cdna_seqs{$id} = $seq; $untouched_cdna_seqs{$id} = 0; $orig_cdna_order{$id} = $i; $i++; } $/ = $old_splitter; # back to the original file splitter...for future reading #figure out which cdna seqs contain rrna matches. Note them and remove them my $rna_cmd ="$blast_bin -F F -d $rrna_db -i $cdna_file -b 1 -v 1 -e 1e-50 -m 8"; my $blastall_result = `$rna_cmd`; #put output from blast into blastall_result my @blast_array = split(/\n/,$blastall_result);#split blast result by newline my %cdnas_with_rrna; foreach my $row (@blast_array) { my ($blast_name) = ($row =~ m/(.+?)\s/); #dump names with hits into $blast_name $cdnas_with_rrna{$blast_name} = 1; #create deduped array (as hash) of all cdnas with rrna } foreach my $id (keys %cdnas_with_rrna) { print RDNA_SEQS ">$id\n$cdna_seqs{$id}\n"; delete $cdna_seqs{$id}; } #figure out which cdna seqs contain insertion sequences. Note them my $is_cmd ="$blast_bin -F F -d $is_db -i $cdna_file -b 1 -v 1 -e 1e-10 -m 8"; my $is_blastall_result = `$is_cmd`; #put output from blast into blastall_result my @is_blast_array = split(/\n/,$is_blastall_result);#split blast result by newline foreach my $row (@is_blast_array) { my ($blast_name) = ($row =~ m/(.+?)\s/); #dump names with hits into $blast_name print IS_MATCHES "$blast_name\n"; } #Blast cdna sequences against the blast db. Finds best matches in genomic for each cdna #For each cdna, pick the good (hit (if it's possible and there's only one) and use it to perform QC. my $blast_cmd ="$blast_bin -a 1 -d $database -i $cdna_file -o $files_dir$blast_result_file -b 2 -v 2 -e $blast_e_val"; my $res = `$blast_cmd`; my $srchio = new Bio::SearchIO ('-format' => 'blast', '-file' => "$files_dir$blast_result_file"); # Now look at the results for each Query while (my $res = $srchio->next_result) { my ($good_hit_count, $first_good_hit) = test_hits($res); next unless $res->query_name; delete($untouched_cdna_seqs{$res->query_name}); next unless $cdna_seqs{$res->query_name}; #If we removed it during rrna processing, don't bother with the blast result #...and pass the result on to the appropriate function, # according to the # of good hits if ($good_hit_count == 0) { my $qid = $res->query_name; push (@not_hit_cdnas, $qid); } elsif ($good_hit_count == 1) { one_good_hit($res, $first_good_hit); } else { # $good_hit_count > 1 many_good_hits($res); } } #Wrap-up: note all the sequences that didn't have hits, and deal with them. #Print out the whole list of un-hit cdnas, along with alteration stats foreach my $id ( keys %untouched_cdna_seqs ) { next unless $id; next unless $cdna_seqs{$id}; push (@not_hit_cdnas, $id); } #sort by original order @not_hit_cdnas = sort { $orig_cdna_order{$a} <=> $orig_cdna_order{$b}} @not_hit_cdnas; foreach my $id (@not_hit_cdnas) { print NOHITS "$id \n\n"; print NOHITS_SEQS ">$id $no_hit_fasta_line\n$cdna_seqs{$id}\n"; } print ALTERED "Total number of nt in 5'UTRs of final set = $utr5_nt_num\n"; print ALTERED "Total number of nt in ORFs of final set = $orf_nt_num\n"; print ALTERED "Total number of nt in 3'UTRs of final set = $utr3_nt_num\n"; # . # . # . #That's the end of the main program. Subroutines called in that program are below # . # . # . # ------------------ # # Subroutines # # ------------------ # # #################### # test_hits # in: a bioperl ResultI object representing the hits for one cdna sequence # out: # of good hits (above the threshold), and the first one found (a HitI object) # # The first hit found is returned in case of >=1 hit, but is only used by the calling # function in the case that exactly one hit is found # #################### sub test_hits { my $res = $_[0]; my $good_hit_count=0; my $good_hit ; my $first_good_hit; while (my $hit = $res->next_hit) { $good_hit = undef ; while (my $hsp = $hit->next_hsp) { my $hspPct = $hsp->frac_identical() * 100; if ($hspPct >= $good_hit_pct) { $good_hit = $hit; $first_good_hit ||= $hit ; #assign first_good_hit if it isn't already assigned last; } } $good_hit_count++ if $good_hit; } return ($good_hit_count, $first_good_hit); } # #################### # one_good_hit # in: a bioperl ResultI object for one cdna and a single HitI object from the set found in that Result # out: nothing # # This gets called if there's one good hit. It handles the job of matching up the cdna # sequence to genomic (clustering to find the bounds of the genomic sequence, then using # fasta and sim4), and storing the results. In clustering, may identify problem in sequence, # which will cause the sequence to be handled as a chimera # #################### sub one_good_hit { my ($result, $hit) = @_; my $qid = $result->query_name; my $cdna_number = $result->query_name; if (! $cdna_seqs{$qid} ) { #must have been deleted in the rrna clearing (based on testing) return; } my $hitname = $hit->name; $hitname =~ /\|(.+?)\|/ || $hitname =~ /\|(.*)/; $hitname = $1; #trimmed the "name" down to just the refernce # (no pipes, etc) my $lowest_good_pos = $big_number; my $highest_good_pos = -1; my $blast_strand; #will store a -1 or +1, depending on the alignment orientation of the first "good" hsp $hit->rewind; #we've already run through all hsps...reset so we can do it again while (my $hsp = $hit->next_hsp) { my $hspPct = $hsp->frac_identical() * 100; if ( $hspPct >= $good_hit_pct) { #reorient genomic sequence so that cDNA and genomic sequence are in #same direction when input into sim4 (blast strand will be +1 or -1 #and fastacmd strand will be 1 (plus/top) or 2 (minus/bottom)). Do this #otherwise sim4 reorients the cDNA to match the genomic (!!!) my $query_strand = $hsp->strand; my $subject_strand = $hsp->hit->strand; my $hsp_strand = $query_strand * $subject_strand ; my $hspSStart = $hsp->hit->start ; my $hspSEnd = $hsp->hit->end ; $blast_strand = $hsp_strand if !$blast_strand; $lowest_good_pos = $hspSStart if ($hspSStart<$lowest_good_pos); $highest_good_pos = $hspSEnd if ($hspSEnd > $highest_good_pos); } } #if two totally unrelated regions of the same chromosome get counted in the same hit, # they should be counted as two separate hits my @hsps_cluster_result = test_hsps_clustering($hit); my $clust_res = shift(@hsps_cluster_result); if ($clust_res == 1 ) { #just keep going; } elsif ($clust_res == 2 ) { #There are 2+ clusters, but they all overlap on cDNA, so # the function returns the boundaries of the largest genomic coverage by any one of the clusters ($lowest_good_pos,$highest_good_pos) = @hsps_cluster_result; #keep going; } else { #There are 2+ clusters, and at least two of them do not overlap on cDNA...so go to many hits print_many_hits ($result); return; } #convert BLAST strand designation to fastacmd strand designation my $fastacmd_strand = 1 if ($blast_strand > 0); $fastacmd_strand = 2 if ($blast_strand < 0); $lowest_good_pos -= $flank_length; $lowest_good_pos = 0 if ($lowest_good_pos < 0); $highest_good_pos += $flank_length; #fastacmd to pull the correct range of genomic sequence from the db, for use by sim4 my $cmd = "$fasta_bin -d $database -p F \\ -s $hitname -S $fastacmd_strand -L $lowest_good_pos,$highest_good_pos"; #-S (strand) default(top)=1, bottom=2 my $fasta_result = `$cmd`; if (!$fasta_result) { #there was an error running this fasta_cmd print FASTA_ERRORS "$cmd\n\n"; return; } open (TEMPSEQ, ">$files_dir$cdna_seq_file"); print TEMPSEQ ">$qid\n$cdna_seqs{$qid}"; close TEMPSEQ; open (TEMPSEQ, ">$files_dir$genomic_seq_file"); print TEMPSEQ "$fasta_result"; close TEMPSEQ; my $gen_length = length($fasta_result); my $sim4result = `$sim4_bin $files_dir$cdna_seq_file $files_dir$genomic_seq_file A=$sim4_A, K=$sim4_K`; process_sim4($sim4result, $cdna_seqs{$qid}, $fasta_result, $qid, $hitname, $fastacmd_strand, $lowest_good_pos, $highest_good_pos); `rm $files_dir$cdna_seq_file`; `rm $files_dir$genomic_seq_file`; } # #################### # many_good_hits # in: a bioperl ResultI object for one cdna # out: nothing # # The current cdna seems to have chimeric sequence. # If this is true, note it and move to next sequence # Otherwise, redirect to the one-good-hit process. # #################### sub many_good_hits { my $result = $_[0]; #first argument (result) gets copied into local variable my $qid = $result->query_name; #if the two hits had an overlap, we pass the best (first) hit off to one_good_hit # then "return"...which shortcircuits the rest of the code, returning to the calling function if ( hits_overlap($result) ){ one_good_hit ($result, ($result->hits)[0]); } else { print_many_hits ($result); } } # #################### # print_many_hits # in: a bioperl ResultI object for one cdna # out: nothing # # Print info about a chimeric sequence # #################### sub print_many_hits { my $result = $_[0]; my $qid = $result->query_name; print MANYHITS_SEQS ">$qid $many_hit_fasta_line\n$cdna_seqs{$qid}\n"; print MANYHITS "----------\ncdna name: $qid \n\n"; foreach my $hit ($result->hits) { my $hitname = $hit->name; my $acc = $hit->accession; my $dsc = $hit->description; my $printed_hitname = 0; $hit->rewind; while (my $hsp = $hit->next_hsp) { my $hspPct = $hsp->frac_identical() * 100; if ($hspPct >= $good_hit_pct) { if (!$printed_hitname) { print MANYHITS "\t$acc\n"; $printed_hitname = 1; } print MANYHITS "Query:". $hsp->query->start." - "; #print start and stop of query--do the 100% print MANYHITS $hsp->query->end."\n"; #matches overlap? my $hit_strand = $hsp->strand * $hsp->hit->strand; if ($hit_strand == 1) { print MANYHITS "Hit:". $hsp->hit->start." - "; #print start and stop of query--do the 100% print MANYHITS $hsp->hit->end."\n"; #matches overlap? } else { print MANYHITS "Hit:". $hsp->hit->end." - "; #print start and stop of query--do the 100% print MANYHITS $hsp->hit->start."\n"; #matches overlap? } print MANYHITS $hsp->query_string."\n"; print MANYHITS $hsp->homology_string."\n"; print MANYHITS $hsp->hit_string."\n\n"; } } } } # #################### # process_sim4 # in: # $sim4result: the standard output of a sim4 run on one cdna seq and one genomic region sequence # $cdna_seq: the cdna sequence # $genomic_seq: the sequence from the best matching genomic region # $qid: the name of the cdna sequence # $hitname: the name of the hit genomic sequence (from the blast) # $fastacmd_strand: a 1 or a 2, depending on orientation of blast match to genomic seq in db # $lowest_good_pos: beginning of the range that was used to fetch genomic (used in case genomic must be expanded) # $highest_good_pos end of the range that was used to fetch genomic (used in case genomic must be expanded) # # out: nothing # # Given a sim4 alignment, check to see if it covers the entire cdna. # - if it does, then print out the altered sequence (and gather stats) # - if it doesn't, hand off to a function that will try a longer stretch of genomic # #################### sub process_sim4 { my ($sim4result, $cdna_seq, $genomic_seq, $qid, $hitname, $fastacmd_strand, $lowest_good_pos, $highest_good_pos) = @_; my ($cdna_length) = ($sim4result =~ m/seq1.*?(\d+)\sbp/); my $sim4result_orig = $sim4result; $sim4result =~ s/(seq\d.*?\n)//g; #trim off the info preceeding the exon position info my @exons = split(/(?<=%)\s+(?:->|<-)/,$sim4result); #split on -> or <- $genomic_seq =~ s/^.*?\n//; #remove the first line (gene id, etc.) $genomic_seq =~ s/\n//g; #remove all newlines (compact string to just bases) my $fixed_cdna_seq = ""; foreach my $exon_string (@exons) { my ($start, $stop, $pct) = ($exon_string =~ m/\((\d+)-(\d+)\)\s+(\d+)\%/); $fixed_cdna_seq .= substr($genomic_seq,$start-1,$stop-$start+1); } #Check the starts and stops of the sim4 alignment--if doesn't start at # first cdna position or last cdna position, don't bother printing here (it's already been handled). #If it does start and stop at the right place, but already got handeled # as a mystery seq (lacks internal sim), don't bother printing. my ($too_short, $already_wrote_seq) = test_too_short($qid, $sim4result, $fixed_cdna_seq, $cdna_seq, $cdna_length,\@exons, $hitname, $fastacmd_strand, $lowest_good_pos, $highest_good_pos); if ( $too_short || $already_wrote_seq ){ return; #too_short_test was either too short, # or was not too short ...but already got handeled as a mystery seq (lacks internal sim). # In either case, we don't want to print to one_hit } #got here? must be a single good hit that covers the cdna, so count the changes # and print out the fixed sequence. my ($orf_ins, $orf_del, $orf_ins_count, $orf_del_count, $orf_sub, $utr5_ins,$utr5_del,$utr5_ins_count,$utr5_del_count,$utr5_sub, $utr3_ins,$utr3_del,$utr3_ins_count,$utr3_del_count,$utr3_sub, $frame_shifted, $stop_added, $orf_start, $orf_stop ) = count_alterations($sim4result_orig); if ($orf_sub || $orf_ins || $orf_del || $utr5_ins || $utr5_del || $utr5_sub || $utr3_ins || $utr3_del || $utr3_sub) { print ONEHIT ">$qid $one_hit_altered_fasta_line\n"; print ONEHIT "$fixed_cdna_seq\n"; print ALTERED "$qid\t"; print ALTERED join ("\t", ($utr5_ins,$utr5_del,$utr5_sub, $orf_ins, $orf_del, $orf_sub, $utr3_ins,$utr3_del,$utr3_sub, $frame_shifted?'T':'F', $stop_added?'T':'F')) ."\n"; print ALTERED_ALIGN "$qid$sim4result\n\n" ; } else { print ONEHIT ">$qid $one_hit_perfect_fasta_line\n"; print ONEHIT "$fixed_cdna_seq\n"; } } # #################### # count_alterations # in: the standard output of a sim4 run on one cdna seq and one genomic region sequence # out: an array of counts, corresponding to substitution, indel, and premature stop codons # for 5'UTR, ORF and 3'UTR # # Counts through all the positions at which cdna and genomic differ (per sim4 alignment), # and tracks substitutions, indels and early stop codons. UTR-ORF boundaries are # based on the assumption that the longest orf is the protein coding orf. # #################### sub count_alterations { my $sim4_results = $_[0]; $sim4_results =~ s/.*%\n+//; #strip off everything up to the first row of alignment. my @align_rows = split(/\n\n/, $sim4_results); my ($orf_sub, $orf_ins, $orf_del, $curr_ins, $curr_del, $frame_shifted, $stop_added) = (0,0,0,0,0,0,0,0,0,0,0,0); my ($full_hit_len, $full_hom_len, $full_query_len) = (0,0,0); my ($full_hit_str, $full_hom_str, $full_query_str, $hit_str, $hom_str, $query_str); my ($utr5_ins,$utr5_del,$utr5_curr_ins,$utr5_curr_del,$utr5_sub) = (0,0,0,0,0); my ($utr3_ins,$utr3_del,$utr3_curr_ins,$utr3_curr_del,$utr3_sub) = (0,0,0,0,0); my ($orf_del_count, $orf_ins_count, $utr5_del_count, $utr5_ins_count, $utr3_del_count, $utr3_ins_count) = (0,0,0,0,0,0,0,0); my ($first_orf_pos_ch_in_hit, $last_orf_pos_ch_in_hit) = (0,0); #NOTE: "hit" = cdna, "query"=genomic #getorf to find longest orf foreach my $row (@align_rows) { next unless $row =~ /:[\s\.]*\n(.*?)\n(.*?)\n(.*)/; my $hit_line = $1; my $hom_line = $2; my $query_line = $3; #determine the length of the left column, used to place position #s. $hit_line =~ /^(.*?\d+\s)/i; my $num_col_width = length($1); #the next 3 regexes will strip that many positions at beginning $hit_line =~ s/^.{$num_col_width}//; $full_hit_str .= $hit_line; $hom_line =~ s/^.{$num_col_width}//; $full_hom_str .= $hom_line; $query_line =~ s/^.{$num_col_width}//; $full_query_str .= $query_line; } $full_hit_len = length($full_hit_str); $full_hom_len = length($full_hom_str); $full_query_len = length($full_query_str); #remove the 3 preceeding and following characters for a ">>>...>>>" entry my $compressed_query_str = $full_query_str; $compressed_query_str =~ s/.{3}\.{3}.{3}//g; #Find the major ORF my $orf_start = 1; my $orf_stop = 1; #length($compressed_hit_str); { open (TEMPSEQ, ">$files_dir$genomic_seq_file.b"); print TEMPSEQ "$compressed_query_str"; close TEMPSEQ; my $getorf_result = `getorf -find 1 -noreverse -auto -filter $files_dir$genomic_seq_file.b 2>&1`; `rm $files_dir$genomic_seq_file.b`; my @orfs = split(/>/,$getorf_result); shift @orfs; foreach my $orf (@orfs) { my ($cur_orf_start, $cur_orf_stop) = ($orf =~ m/\[(\d+) - (\d+)\]/); if ($cur_orf_stop-$cur_orf_start > $orf_stop-$orf_start) { $orf_start = $cur_orf_start; $orf_stop = $cur_orf_stop; } } } $utr5_nt_num += $orf_start - 1; $orf_nt_num += $orf_stop - $orf_start + 1; $utr3_nt_num += length($compressed_query_str) - $orf_stop ; #do a whole lotta counting (and tracking for early stop codons inside the orf) my $hit_ch_cnt = 0; my $query_ch_cnt = 0; for my $pos (0..$full_hom_len-1) { my $hom_ch = substr($full_hom_str,$pos,1); next if $hom_ch =~ /[><.]/; #skip the ">>>...>>>" sections my $hit_ch = substr($full_hit_str,$pos,1);# if $full_hit_len>$_; $hit_ch_cnt++ if $hit_ch && $hit_ch !~ /[ -]/; my $query_ch = substr($full_query_str,$pos,1);# if $full_query_len>$_; $query_ch_cnt++ if $query_ch && $query_ch !~ /[ -]/; #5'utr region if ($query_ch_cnt < $orf_start ) { if ($hom_ch eq "-") { if ($query_ch eq " ") { $utr5_curr_ins++; } else { $utr5_curr_del++; } } else { #just finished an indel if ($utr5_curr_ins>$utr5_curr_del) { $utr5_ins_count++; $utr5_ins += $utr5_curr_ins; $utr5_curr_ins=0; } elsif ($utr5_curr_del>$utr5_curr_ins) { $utr5_del_count++; $utr5_del += $utr5_curr_del; $utr5_curr_del=0; } if ($hom_ch eq " ") { $utr5_sub++; } } next; } #3' region if ($query_ch_cnt > $orf_stop) { $last_orf_pos_ch_in_hit = $hit_ch_cnt-1 if !$last_orf_pos_ch_in_hit; if ($hom_ch eq "-") { if ($query_ch eq " ") { $utr3_curr_ins++; } else { $utr3_curr_del++; } } else { if ($utr3_curr_ins>$utr3_curr_del) { $utr3_ins_count++; $utr3_ins += $utr3_curr_ins; $utr3_curr_ins=0; } elsif ($utr3_curr_del>$utr3_curr_ins) { $utr3_del_count++; $utr3_del += $utr3_curr_del; $utr3_curr_del=0; } if ($hom_ch eq " ") { $utr3_sub++; } } next; } #in the orf $first_orf_pos_ch_in_hit = $hit_ch_cnt if !$first_orf_pos_ch_in_hit; $hit_str .= $hit_ch if ( $hom_ch eq "|" || $hom_ch eq " " || ($hom_ch eq "-" && $hit_ch ne " ")); $query_str .= $query_ch if ( $hom_ch eq "|" || $hom_ch eq " " || ($hom_ch eq "-" && $query_ch ne " ")); if ($hom_ch eq "-") { if ($query_ch eq " ") { $curr_ins++; } else { $curr_del++; } } elsif ($hom_ch eq " ") { $orf_sub++; } if ($hom_ch eq "-" || $hom_ch eq "|" || $hom_ch eq " ") { #ignore spaces between alignments (which look like ">>>...>>>") if ($query_ch_cnt >= $orf_start and $query_ch_cnt <= $orf_stop) { if ($hom_ch ne "-" ) { #reading an equality/substitution if ($curr_del%3 || $curr_ins%3) {#if we've just ended a non-3bp indel $frame_shifted=1; } $orf_del += $curr_del; $orf_ins += $curr_ins; $orf_del_count++ if $curr_del; $orf_ins_count++ if $curr_ins; $curr_del = 0; $curr_ins = 0; } elsif ( $query_ch ne " ") { # starting a deletion $frame_shifted=1 if ($curr_ins%3); #out-of-phase $orf_ins += $curr_ins; $orf_ins_count++ if $curr_ins; $curr_ins = 0; } elsif ( $query_ch eq " ") { # starting an insertion $frame_shifted=1 if ($curr_del%3); #out-of-phase $orf_del += $curr_del; $orf_del_count++ if $curr_del; $curr_del = 0; } } } } #in case there's an indel at the end of the 3' end if ($utr3_curr_ins>$utr3_curr_del) { $utr3_ins_count++; $utr3_ins += $utr3_curr_ins; } elsif ($utr3_curr_del>$utr3_curr_ins) { $utr3_del_count++; $utr3_del += $utr3_curr_del; } #in case there's an indel at the end of the 5' end if ($utr5_curr_ins>$utr5_curr_del) { $utr5_ins_count++; $utr5_ins += $utr5_curr_ins; } elsif ($utr5_curr_del>$utr5_curr_ins) { $utr5_del_count++; $utr5_del += $utr5_curr_del; } #in case there's an indel at the end of the orf if ($curr_ins>$curr_del) { $frame_shifted=1 if ($curr_ins%3); #out-of-phase $orf_ins_count++; $orf_ins += $curr_ins; } elsif ($curr_del>$curr_ins) { $frame_shifted=1 if ($curr_del%3); #out-of-phase $orf_del_count++; $orf_del += $curr_del; } my $orf_len_in_hit_str = $last_orf_pos_ch_in_hit - $first_orf_pos_ch_in_hit + 1; for ($i = 0; $i < $orf_len_in_hit_str-2; $i+=3) { my $codon = substr($hit_str,$i,3); if ($codon =~ /t(aa|ag|ga)/i) { $stop_added = 1; #changed to a stop codon } } return ($orf_ins, $orf_del, $orf_ins_count, $orf_del_count, $orf_sub, $utr5_ins,$utr5_del,$utr5_ins_count,$utr5_del_count,$utr5_sub, $utr3_ins,$utr3_del,$utr3_ins_count,$utr3_del_count,$utr3_sub, $frame_shifted, $stop_added, $orf_start, $orf_stop ); } # #################### # hits_overlap # in: a bioperl ResultI object for one cdna # out: boolean # # Test whether a series of hits overlap on cDNA # #################### sub hits_overlap { my $result = $_[0]; my @hits = $result->hits(); $hits[0]->rewind; while (my $hsp0 = $hits[0]->next_hsp) { my $hspPct0 = $hsp0->frac_identical() * 100; if ($hspPct0 >= $good_hit_pct) { $hits[1]->rewind; #need to reset the pointer to the beginning of the hits[1] hsp list while (my $hsp1 = $hits[1]->next_hsp) { my $hspPct1 = $hsp1->frac_identical() * 100; if ($hspPct1 >= $good_hit_pct) { if ( overlaps_on_cdna($hsp0, $hsp1) ) { return 1; #overlaps } } } } } return 0; #doesn't overlap } # #################### # overlaps_on_cdna # in: two bioperl hsp objects # out: a boolean indicating whether or not a pair of hsps overlap # # Test whether sequences overlap on the cDNA # #################### sub overlaps_on_cdna { my $min_overlap = 100; #there must be 100nt of overlap to throw one sequence out my ($hsp0, $hsp1) = @_; my $start0 = $hsp0->query->start; my $stop0 = $hsp0->query->end; my $start1 = $hsp1->query->start; my $stop1 = $hsp1->query->end; return overlap($start0, $stop0, $start1, $stop1, $min_overlap); } # #################### # overlap # in: two pairs of integers representing start and stop positions, and one integer indicating allowed "slop" # out: a boolean # # Test of overlap in two integer ranges (the amount of overlap must exceed a threshold before resulting in "true") # #################### sub overlap { my ($start0, $stop0, $start1, $stop1, $min_overlap) = @_; return ( ($start1>$start0 && $stop0-$start1+1 >= $min_overlap) #1 is on right side of 0 || ($stop1<$stop0 && $stop1-$start0+1 >= $min_overlap) #1 is on left side of 0 || ($start1>=$start0 && $stop1<=$stop0) #1 is contained within the range of 0 || ($start0>=$start1 && $stop0<=$stop1) #0 is contained within the range of 1 ) ; #if one of the statements is true, return a true value (1) } # #################### # test_hsps_clustering # in: a single bioperl HitI object # out: returns a list of one or more integers. The first is an integer from the set (1,2,3), with the following meaning: # 1 - all hsps in this hit form one contiguous cluster # 2 - multiple clusters, all hitting the same cdna region. In this case, the cluster # touching the the largest genomic range is chosen, and it's range is returned as # the 2nd and 3rd elements of the returned list # 3 - the hsps form multiple clusters, and they cover different parts of the cdna # # Check to see if two totally unrelated regions of the same chromosome # got counted as a single blast hit. If so, they should be counted as two # separate hits (that's the responsibility of the calling function) # #################### sub test_hsps_clustering { my $hit = $_[0]; my @hsps_first = $hit->hsps(); my @hsps; #only consider hsps with high-enough pct identity foreach (@hsps_first) { my $hspPct = $_->frac_identical() * 100; push (@hsps, $_) if $hspPct >= $good_hit_pct; } @hsps = sort { $a->hit->start <=> $b->hit->start} @hsps; my @cluster_delimiters; my $k=1; $cluster_delimiters[0] = 0; for (my $i=1; $i<=$#hsps; $i++) { if ( $hsps[$i]->hit->start > $hsps[$i-1]->hit->end + $cluster_proximity ) { #big step between two consecutive hsps #now I know where a gap starts, splitting two groups of hsps $cluster_delimiters[$k] = $i; $k++; } } $cluster_delimiters[$k] = $#hsps+1; if ( $k == 1 ) { #all part of one cluster. return 1; #it took small steps to go from one hsp to another, on genomic } else{ #the cdna matches 2+ areas of genomic. Treat it the same as in the hits_overlap case: # If all hsp clusters hit the same cdna region treat as one hit, use the best one. # (this means "return signal to treat as many_hits, along with the boundaries of # the largest genomic range touched by one of the clusters") # otherwise return signal that they don't overlap on cdna #All the hsps have been sorted according to position on genomic, so we can consider # a sequence in which each is "close" to the preceding hsp as a cluster. #Build the cdna sequence range for the ith and i+1th groups, then see if they overlap on the cdna. # If for any two sequential clusters, the answer is no, return "3", which means "2+ genomic, no overlap" # In the calling function, this will be treated as multi-hit my ($largest_cluster, $largest_cluster_best_hsp_score) = (0,0); my ($largest_cluster_cdna_start,$largest_cluster_cdna_stop) = (-1,-1); for (my $i=0; $i<$k; $i++) { my ($group1_cdna_start,$group1_cdna_stop,$group1_best_hsp_score) = ($big_number,-1, 0); for my $j ($cluster_delimiters[$i]..$cluster_delimiters[$i+1]-1) {#for all hsps of the current cluster $group1_cdna_start = $hsps[$j]->query->start if $hsps[$j]->query->start < $group1_cdna_start; $group1_cdna_stop = $hsps[$j]->query->end if $hsps[$j]->query->end > $group1_cdna_stop; $group1_best_hsp_score = $hsps[$j]->score() if $hsps[$j]->score() > $group1_best_hsp_score; } if ($group1_cdna_stop-$group1_cdna_start > $largest_cluster_cdna_stop- $largest_cluster_cdna_start) { ($largest_cluster_cdna_stop,$largest_cluster_cdna_start) = ($group1_cdna_stop,$group1_cdna_start); $largest_cluster = $i; } if ( ($group1_cdna_stop-$group1_cdna_start == $largest_cluster_cdna_stop- $largest_cluster_cdna_start) && ($group1_best_hsp_score > $largest_cluster_best_hsp_score ) ) { ($largest_cluster_cdna_stop,$largest_cluster_cdna_start) = ($group1_cdna_stop,$group1_cdna_start); $largest_cluster = $i; $largest_cluster_best_hsp_score = $group1_best_hsp_score; } } for (my $i=0; $i<$k; $i++) { my ($group2_cdna_start,$group2_cdna_stop) = ($big_number,-1); for my $j ($cluster_delimiters[$i]..$cluster_delimiters[$i+1]-1) { $group2_cdna_start = $hsps[$j]->query->start if $hsps[$j]->query->start < $group2_cdna_start; $group2_cdna_stop = $hsps[$j]->query->end if $hsps[$j]->query->end > $group2_cdna_stop; } my $min_overlap=100; if ( ! overlap($largest_cluster_cdna_start, $largest_cluster_cdna_stop, $group2_cdna_start, $group2_cdna_stop, $min_overlap) ){ return 3; # at least one of the genomic clusters doesn't overlap on cDNA, # so return value that will cause calling function to call to print_many_hits } } my ($best_start, $best_stop) = ($big_number,0); #look for earliest start and latest stop among hsps in the largest (in cdna length) cluster for my $j ($cluster_delimiters[$largest_cluster]..$cluster_delimiters[$largest_cluster+1]-1) { $best_start = $hsps[$j]->hit->start if $hsps[$j]->hit->start < $best_start; $best_stop = $hsps[$j]->hit->end if $hsps[$j]->hit->end > $best_stop; } return (2, $best_start, $best_stop); } } # #################### # test_too_short # in: # $qid: the name of the cdna sequence # $sim4result: the standard output of a sim4 run on one cdna seq and one genomic region sequence # $fixed_cdna_seq: the genomic sequence found to correspond with the cdna # $cdna_seq: the cdna sequence # $cdna_length : integer # $exons_ptr : pointer to an array of strings representing exons from the sim4 result # $hitname: the name of the hit genomic sequence (from the blast) # $fastacmd_strand: a 1 or a 2, depending on orientation of blast match to genomic seq in db # $lowest_good_pos: beginning of the range that was used to fetch genomic (used in case genomic must be expanded) # $highest_good_pos end of the range that was used to fetch genomic (used in case genomic must be expanded) # # out: returns a list of one or two values. The first is an integer with the following meaning: # 1 - Sequence is either too short (missing terminal hom) or is missing internal homology. # This tells the calling function not to write the sequence out as a good hit # 0 - Sequence does not fall into the situation described in "1". In this case, the sequence either # needs to be written out (in which case the second returned value is a 0) or has already been # written out (in which case the second returned value is a 1) # # # If the first sim4 alignment doesn't cover the 5' or 3' end of the cdna, expand the genomic region used # in the alignment to see if we can cover the ends. # Returns false (i.e. "not too short") if the sequence covers the cdna, or if it is able to # expand the genomic sequence to cover the cdna (via repeated sim4 call and a call to test_too_short2). # Returns true if the the expanded genomic fails to cover the cdna ends, or results # in missing internal homology (represented by the "==" entry in the sim4 result. # #################### sub test_too_short { my ($qid, $sim4result, $fixed_cdna_seq, $cdna_seq, $cdna_length,$exons_ptr, $hitname, $fastacmd_strand, $lowest_good_pos, $highest_good_pos) = @_; my $wrote_seqs = 0; #dereference the exons pointer and match to the array element my ($first) = (@$exons_ptr[0] =~ m/\n(\d+)/); #start position (of query seq) of the first exon my ($last) = (@$exons_ptr[-1] =~ m/-(\d+)/); #end position (of query seq) of the last exon #e.g. 1-200 ... 2010-2200 will give : $first = 1, $last = 2200 my $length_minus_discrep = $cdna_length-$end_discrep; #If too short, expand the genomic sequence and check alignment again with a repeated sim4 call if ( $first > $begin_discrep || $last < $length_minus_discrep ) { # This has the chance of picking up the missing flanking region. # (it's a near-duplicate of the earlier sim4 handling procedure) if ($first > $begin_discrep){ if ($fastacmd_strand == 1) { $lowest_good_pos -= $flanking_ext_increase; } else { $highest_good_pos += $flanking_ext_increase; } } if ($last < $length_minus_discrep) { if ($fastacmd_strand == 1) { $highest_good_pos += $flanking_ext_increase; } else { $lowest_good_pos -= $flanking_ext_increase; } } $lowest_good_pos = 0 if ($lowest_good_pos < 0); my $cmd2 = "$fasta_bin -d $database -p F \\ -s $hitname -S $fastacmd_strand -L $lowest_good_pos,$highest_good_pos"; my $genomic_seq2 = `$cmd2`; if (!$genomic_seq2) { print FASTA_ERRORS "$cmd2 - fasta cmd failed to get a sequence\n"; } open (TEMPSEQ, ">$files_dir$cdna_seq_file.c"); print TEMPSEQ ">$qid\n$cdna_seq"; close TEMPSEQ; open (TEMPSEQ, ">$files_dir$genomic_seq_file.c"); print TEMPSEQ "$genomic_seq2"; close TEMPSEQ; #sim4 alignment number 2 my $sim4result2 = `$sim4_bin $files_dir$cdna_seq_file.c $files_dir$genomic_seq_file.c A=$sim4_A, K=$sim4_K`; `rm $files_dir$cdna_seq_file.c`; `rm $files_dir$genomic_seq_file.c`; my $sim4result2_orig = $sim4result2; #Look for "==" in sim4 output. This indicates missing internal homology. # Dump into $mystery_seqs.out if ($sim4result2 =~ /==/) { print MYSTERY "-------------\n$qid (expanded genomic seq sim4 test)\n$sim4result2\n\n"; print MYSTERY_SEQ ">$qid $non_homol_intern_fasta_line\n$cdna_seq\n"; return 1; # too short, and it's a mystery sequence } my ($cdna_length) = ($sim4result2 =~ m/seq1.*?(\d+)\sbp/); $sim4result2 =~ s/(seq\d.*?\n)//g; #trim off the info preceeding the exon position info my @exons2 = split(/(?<=%)\s+(?:->|<-)/,$sim4result2); #split on -> or <- $genomic_seq2 =~ s/^.*?\n//; #remove the first line (gene id, etc.) $genomic_seq2 =~ s/\n//g; #remove all newlines (compact string to just bases) my ($orf_ins, $orf_del, $orf_ins_count, $orf_del_count, $orf_sub, $utr5_ins,$utr5_del,$utr5_ins_count,$utr5_del_count,$utr5_sub, $utr3_ins,$utr3_del,$utr3_ins_count,$utr3_del_count,$utr3_sub, $frame_shifted, $stop_added, $orf_start, $orf_stop ) = count_alterations($sim4result2_orig); my $fixed_cdna_seq2 = ""; foreach my $exon_string2 (@exons2) { my ($start, $stop, $pct) = ($exon_string2 =~ m/\((\d+)-(\d+)\)\s+(\d+)\%/); $fixed_cdna_seq2 .= substr($genomic_seq2,$start-1,$stop-$start+1); } #check the starts and stops of the sim4 alignment--if doesn't start at #first cdna position or last cdna position, print out to too_short.out and #if return is true, get out of subroutine (return) my $too_short_test = test_too_short_two($qid, $sim4result2, $fixed_cdna_seq2, $cdna_seq, $cdna_length,\@exons2); if ( $too_short_test > 0 ){ return 1;#return true...the match was too short } if ($sim4result !~ /==/) { #don't want to print to altered if I'm about to print to mystery if ($orf_sub || $orf_ins || $orf_del || $utr5_ins || $utr5_del || $utr5_sub || $utr3_ins || $utr3_del || $utr3_sub) { print ONEHIT ">$qid $one_hit_altered_fasta_line (expanded genomic seq sim4 test)\n"; print ONEHIT "$fixed_cdna_seq2\n"; print ALTERED "$qid $non_homol_term_fasta_line\t"; print ALTERED join ("\t", ($utr5_ins,$utr5_del,$utr5_sub, $orf_ins, $orf_del, $orf_sub, $utr3_ins,$utr3_del,$utr3_sub, $frame_shifted?'T':'F', $stop_added?'T':'F')) . "\n"; print ALTERED_ALIGN "$qid$sim4result\n\n"; } else { print ONEHIT ">$qid $one_hit_perfect_fasta_line (expanded genomic seq sim4 test)\n"; print ONEHIT "$fixed_cdna_seq2\n"; } $wrote_seqs = 1; } } #The sim4result is not too short, Look for "==" in sim4 output, indicating missing internal homology. # dump into $mystery_seqs.out, and return 1 so that it gets out of process_sim4 without writing to ONE_HIT and ALT #note that we have a test of "==" twice in this function. This one only gets called if the other one fails. if ($sim4result =~ /==/) { print MYSTERY "-------------\n$qid\n$sim4result\n\n"; print MYSTERY_SEQ ">$qid $non_homol_intern_fasta_line\n$cdna_seq\n"; return 1; } # The "0" below means "false" (not too short). # The variable tells if the seq has already been written out return (0, $wrote_seqs); } # #################### # test_too_short_two # in: # $qid: the name of the cdna sequence # $sim4result2: the standard output of a sim4 run on one cdna seq and one genomic region sequence # $fixed_cdna_seq2: the genomic sequence found to correspond with the cdna # $cdna_seq: the cdna sequence # $cdna_length : integer # $exons_ptr : pointer to an array of strings representing exons from the sim4 result # # out: 1 if too short, 0 otherwise # # Basically a repeat of test_too_short, without the repeated attempt to get # longer genomic for a new sim4 alignment # #################### sub test_too_short_two { my ($qid, $sim4result2, $fixed_cdna_seq2, $cdna_seq, $cdna_length,$exons_ptr) = @_; #dereference the exons pointer and match to the array element my ($first) = (@$exons_ptr[0] =~ m/\n(\d+)/); #start position (of query seq) of the first exon my ($last) = (@$exons_ptr[-1] =~ m/-(\d+)/); #end position (of query seq) of the last exon #e.g. 1-200 ... 2010-2200 will give : $first = 1, $last = 2200 my $length_minus_discrep = $cdna_length - $end_discrep; if ( $first > $begin_discrep || $last < $length_minus_discrep ) { print TOOSHORT "------$qid\n$sim4result2"; print TOOSHORT "$qid\n"; if ($first > $begin_discrep){ print TOOSHORT "first=$first\n" ; my $str = substr ($cdna_seq, 0, $first-1); print TOOSHORT "first substring: $str\n"; print TOOSHORT_SNIPPETS ">$qid(start)\n$str\n"; } if ($last < $length_minus_discrep) { print TOOSHORT "last=$last (should have been $cdna_length minus $end_discrep)\n" ; my $str = substr ($cdna_seq, $last, $cdna_length-$last); print TOOSHORT "last substring: $str\n"; print TOOSHORT_SNIPPETS ">$qid(end)\n$str\n"; } print TOOSHORT "\n-----\n"; print TOOSHORTSEQS ">$qid $non_homol_term_fasta_line\n"; print TOOSHORTSEQS "$fixed_cdna_seq2\n"; return 1; #return a value = true } return 0; } }; #end of the eval wrapper if ($@) { open (ERROUT, ">$files_dir/ERROR"); print ERROUT $@; }