keeping strand information

daaaaande · Jan 10, 2019 · 64921c1 · 64921c1
1 parent beffb7f
commit 64921c1
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 116 deletions.
diff --git a/matrix_infile_checker.pl b/matrix_infile_checker.pl
@@ -43,7 +43,7 @@
 if($namesmale=~/\-/){
   warn "line $i file $linfile: samplename $namesmale does have a minus in it! \n";
   $mist++;
-
+}
 
 
 if($mist > 0){

diff --git a/matrixmaker-V3_A.pl b/matrixmaker-V3_A.pl
@@ -9,7 +9,7 @@
 #		- takes the outfiles from steptwo (processed.csv)
 #		- for more than one sample (more useful matrix) cat the .processed.csv files into one big file, the rest will be handled by matrixmaker.pl
 #		- adds a little relevant information to each candidate
-#		- needs an output filename- itwill output in .tsv format to be readable for matrixtwo.pl (look at the bottom of this file for example lines for each relevant file)
+#		- needs an output filename- it will output in .tsv format to be readable for matrixtwo.pl (look at the bottom of this file for example lines for each relevant file)
 #		- tracks time of usage
 #		- dumps errors into logfile into global logfile
 ##########################################
@@ -34,7 +34,6 @@
 		my$genene=$slit[0];
 		$genene =~ s/\s+//g; # remove emptieness
 		my$nnum="";
-
 		if(scalar(@slit)>1){
 			$nnum=$slit[1];# will be key
 			if($nnum=~/N/){
@@ -62,7 +61,7 @@
 DATA_IN:
 for (my$i=0;$i<scalar(@allelines);$i++){
 	my$line_o_o=$allelines[$i];	# current line
-#	if ($i>0){# ignore header
+	if (!($line_o_o=~/coordinates/)){# ignore header if there , but should not be present at this stage anyway
 		get_names($line_o_o);
 		sub get_names{# later multithread this also
 			my$line= shift(@_);
@@ -77,13 +76,14 @@
 			if(!(exists($alle_new_info_try_hash{"$cord"}))){ # check for coords redundancy
 				$alle_new_info_try_hash{"$cord"}=$i;
 				$basic_info_hash{"$Refseqid\t$i"}=$i;
-				$alle_coords_hash{"$cord\t$i"}=$i;
+				$alle_coords_hash{"$cord\t$strand"}=$i;
 			}
 
 		}
-#	}
+	}
 }
-# without tie to sort the hashes we need to sort the hash keys by their value- i.e the line number of the infile
+
+# without tie to keep the hashes in order we need to sort the hash keys by their value- i.e the line number of the infile
 # sort the hash keys based on their values (line number in infile )
 my@sorted_by_val_allenames = sort{$sample_hash{$a} <=> $sample_hash{$b}} keys %sample_hash;# sort the keys according to their value
 my@allenames=@sorted_by_val_allenames;		# sample names
@@ -104,7 +104,7 @@
 	chomp $circline;
 	if($circline=~/[a-z]/){
 		my@slit=split(/\s+/,$circline);
-		my$chr=$slit[0];	# plan ; chr:start-end to regex the coordinates of candidates
+		my$chr=$slit[0];
 		my$cordst=$slit[1];
 		my$cordnd=$slit[2];
 		my$circname=$slit[3];
@@ -129,7 +129,7 @@
 chomp $outfile;
 open(OU,">",$outfile)|| die "$!";
 ############################################# get stable header, build resizeable header for samples
-print OU "coordinates\tRefseqID\tGene\tknown_circ\tnum_samples_present\ttotal_sum_unique_counts\tqualities\tpresent_in_sample\t";
+print OU "coordinates\tstrand\tRefseqID\tGene\tknown_circ\tnum_samples_present\ttotal_sum_unique_counts\tqualities\tpresent_in_sample\t";
 foreach my $sampls  (@allenames) {
 	print OU "sample\t-unique_count\t-qualA\t-qualB\t"; # $sampls not in same order as below, need to change it
 }
@@ -146,114 +146,99 @@ sub findc{
   DATA_OUT:
   foreach my $circs (@allecooords){
     $count ++;
-	$pf->start and next DATA_OUT;
+		$pf->start and next DATA_OUT;
     find_circ($circs);
     sub find_circ {
-      my$circ= shift(@_);
-      my$circcand=$circ;
-	$circcand=~s/\t[0-9]{1,30}//;# remove the attached infile line number to not ignore strand differences
-	#print "circ $circ\n\n\n\n\n";
-	my$basicinfo=$allebasicinfo[$count -1];
-	$basicinfo=~s/\t[0-9]{1,30}//;# remove the unique makwer of maybe double circ information
-	if($basicinfo=~/[A-z]/g){
-		chomp $circcand;
-		my$circn="unknown";
-		chomp $basicinfo;
-		my$presencething=""; # for each circ cand, add names of sapmles where it is present
-		my$totalcounts=0;	# for each circ cand, add unique counts
-		my$allquas=""; 		# for each sample, summarize qualities
-		my$line="$circcand\t$basicinfo\t";
-		$line=~s/\n//g;
-		#	print "line is $basicinfo\n";
-		$basicinfo =~ /N*[\+\-]{1}/; # find refseqid
-		my$tolookup = $';
-		chomp $tolookup;
-	#	print "finding information for circ $tolookup with $basicinfo \n";
-		$tolookup="N$tolookup"; # fix for missing N in refseqid 
-		$tolookup =~s/\s+//g;
-		my$allsamplelines="";
-		my$allsamplehit=0;
-		my$gene_name="";
-		if(exists($mapping{$tolookup})){
-			my$geneo=$mapping{$tolookup};
-			$line="$line\t$geneo";
-			$gene_name=$geneo;
-		#		print "found gene $gene_name for circ $tolookup in gene mapping hash\n";
-		}
-		else {
-			$line="$line\tunkn";
-			$gene_name="unkn";
-		}
-		if(exists($known_circs{$circcand})){
-			$circn=$known_circs{$circcand};
-		}
-		else{
-			$circn="unknown";
-		}
-		foreach my $single_sample (@allenames) {# looking for each sample for each circ
-		#print "sample $single_sample\n";
-	  		my$allonesample= $allinfoonesamplehash{$single_sample};
-        		if($allonesample=~/$circcand*.*\n/gi){### is the circ is found in sample###
-				my$line_of_i=$&;
-          			my$lineonesample=$line_of_i; #declare the interesting line
-				#print "starting to destruct line $lineonesample\n";
-      			$lineonesample=~s /$circcand//;
-          			$lineonesample=~s/\n//;
-	    			chomp $lineonesample;
-	    			$lineonesample=~s/\t+\+//;	# removing the strand information from the hit
-	    			$lineonesample=~s/\t+\-//;
-	    			# has still the refseq id for every sample , need to remove that redundant informastion aswell
-	    			$lineonesample =~ s/NM_[0-9]{3,11}//g;
-	    			$lineonesample =~ s/NR_[0-9]{3,11}//g;
-	    			$lineonesample =~ s/\.\s+//; # first remove the dot with space
-	    			$lineonesample =~ tr/\.//;# then withpout
-				$lineonesample =~ s/chr[0-9]{0,3}.*\-[0-9]{1,98}\s+?//g;# remove coords sometimes mixed up in here
-  	      		$allsamplelines="$allsamplelines$lineonesample";
-				if($allsamplelines=~/chr/){
-					warn "error in file: $allsamplelines should not include coordinates , whats the problem?\nfull line: $line_of_i\n";
+      my$circcand= shift(@_);
+			#print "old line is $circcand\n";
+			$circcand=~s/\t[0-9]{1,30}//;# remove the attached infile line number to not ignore strand differences
+			# remove strand, attach later in full line
+			$circcand=~s/\t[+-]{1}//;
+			my$str=$&;
+			my$basicinfo=$allebasicinfo[$count -1];
+			$basicinfo=~s/\t[0-9]{1,30}//;# remove the unique makwer of maybe double circ information
+			if($basicinfo=~/[A-z]/g){
+				chomp $circcand;
+				my$circn="unknown";
+				chomp $basicinfo;
+				my$presencething=""; # for each circ cand, add names of sapmles where it is present
+				my$totalcounts=0;	# for each circ cand, add unique counts
+				my$allquas=""; 		# for each sample, summarize qualities
+				my$line="$circcand\t$basicinfo\t";
+				$line=~s/\n//g;
+				my$tolookup=$basicinfo;
+				chomp $tolookup;
+				my$allsamplelines="";
+				my$allsamplehit=0;
+				my$gene_name="";
+				if(exists($mapping{$tolookup})){
+					my$geneo=$mapping{$tolookup};
+					$line="$line\t$geneo";
+					$gene_name=$geneo;
+					#		print "found gene $gene_name for circ $tolookup in gene mapping hash\n";
 				}
-	      		$presencething="$presencething-$single_sample";
-	      		$lineonesample =~/\s+[0-9]{1,4}\s+/;# only first hit is unique count
-	      		my$findnum = $&; # the unique count for each sample
-	      		my$twoquals=$'; # the two qualities into one
-	      		$twoquals =~ s/\s+/;/;
-				if(!($allquas=~/N/g)){ # check for refseqid instead of number
-	      			$allquas = "$allquas,$twoquals";
-	      			$allquas =~s/\s+//g;
-	      			$totalcounts=$totalcounts + $findnum;
-	      			$ni=$totalcounts;
-	      			$allsamplehit++;
+				else {
+					$line="$line\tunkn";
+					$gene_name="unkn";
+				}
+				if(exists($known_circs{$circcand})){
+					$circn=$known_circs{$circcand};
 				}
 				else{
-					# refseqid is recognized as strand...
-					warn "line not recognized :$line_of_i ,quality is not $allquas or $twoquals\t totalcounts are $totalcounts for sample $single_sample circ $circcand basicinfo $basicinfo \n";
+					$circn="unknown";
 				}
-	  		}
-        		else{# new: if circ not in all one samples
-				chomp $single_sample;
+				foreach my $single_sample (@allenames) {# looking for each sample for each circ
+	  			my$allonesample= $allinfoonesamplehash{$single_sample};
+        		if($allonesample=~/$circcand*.*\n/gi){### is the circ is found in sample###
+							my$line_of_i=$&;
+          		my$lineonesample=$line_of_i; #declare the interesting line
+      				$lineonesample=~s /$circcand//;
+          		$lineonesample=~s/\n//g;
+	    				$lineonesample=~s/\t+[\+\-]//;	# removing the strand information from the hit
+	    				$lineonesample =~ s/N[MR]_[0-9]{3,11}//g; # removing refseqid- is the same for the same coordinates
+							$lineonesample =~ s/chr[0-9]{0,3}.*\-[0-9]{1,98}\s+?//g;# remove coords sometimes mixed up in here
+  	      		$allsamplelines="$allsamplelines$lineonesample";
+							if($allsamplelines=~/chr/){
+								warn "error in file: $allsamplelines should not include coordinates , whats the problem?\nfull line: $line_of_i\n";
+							}
+	      			$presencething="$presencething-$single_sample";
+	      			$lineonesample =~/\s+[0-9]{1,4}\s+/;# only first hit is unique count
+	      			my$findnum = $&; # the unique count for each sample
+	      			my$twoquals=$'; # the two qualities into one
+	      			$twoquals =~ s/\s+/;/;
+							if(!($allquas=~/N/g)){ # check for refseqid instead of number
+	      				$allquas = "$allquas,$twoquals";
+	      				$allquas =~s/\s+//g;
+	      				$totalcounts=$totalcounts + $findnum;
+	      				$ni=$totalcounts;
+	      				$allsamplehit++;
+							}
+							else{
+								# refseqid is recognized as strand...
+								warn "line not recognized :$line_of_i ,quality is not $allquas or $twoquals\t totalcounts are $totalcounts for sample $single_sample circ $circcand basicinfo $basicinfo \n";
+							}
+	  				}
+        	else{# new: if circ not in all one samples
+						chomp $single_sample;
 	      		$allsamplelines="$allsamplelines$single_sample\t0\t0\t0\t";
-	  		}
-  		}
-		$basicinfo=~s/\n//g;
-		$gene_name=~s/\n//g;
-		if(((($circcand=~/\:/)&&($presencething=~/[A-z]/)))){
-			#hr10:93590679-93602148 -       NM_001142434    OGA     hsa_circ_0008170
-  			my$linestring="$circcand\t$basicinfo\t$gene_name\t$circn\t$allsamplehit\t$ni\t$allquas\t$presencething\t$allsamplelines\n";
-	  		$linestring  =~s/\t\t/\t/g;
-	  		print OU $linestring;
-	  		$linestring="";
-			$gene_name="";
-		}
-		else{			# in case something with the line is wrong
-	  		print ER "error in line: circand is $circcand \n basicinfo is $basicinfo \n and presencething is $presencething\n";
-		}
+	  			}
+  			}
+				$basicinfo=~s/\n//g;
+				$gene_name=~s/\n//g;
+				if(((($circcand=~/\:/)&&($presencething=~/[A-z]/)))){
+  				my$linestring="$circcand\t$str\t$basicinfo\t$gene_name\t$circn\t$allsamplehit\t$ni\t$allquas\t$presencething\t$allsamplelines\n";
+	  			$linestring  =~s/\t\t/\t/g;
+	  			print OU $linestring;
+	  			$linestring="";
+					$gene_name="";
+				}
+				else{			# in case something with the line is wrong
+	  			print ER "error in line: circand is $circcand \n basicinfo is $basicinfo \n and presencething is $presencething\n";
+				}
     	}
-
     }
-  #  print "ends line $count ..\n";
-    $pf->finish;
+  $pf->finish;
   }
-
 }# findc end
 $pf->wait_all_children;
 my$end=time;

diff --git a/matrixtwo.pl b/matrixtwo.pl
@@ -1,8 +1,8 @@
 #/usr/bin/perl -w
 use strict;
-require "/media/daniel/NGS1/RNASeq/find_circ/read_mapping.pl";
-use List::MoreUtils qw(uniq);
-#system("clear");
+require "/media/daniel/NGS1/RNASeq/find_circ/read_mapping.pl"; # module reading mapping file for additional information- can be ignored when useless
+use List::MoreUtils qw(uniq); # used to get to a list of unique samplenames later
+
 
 ############################################################ usage
 # perl matrixtwo.pl matrixmaker_outfile.csv matrixtwo_output.tsv
@@ -29,7 +29,6 @@
 #
 #
 
-
 open(ER,'>>',"/home/daniel/logfile_auto.log")||die "$!";		# global logfile
 my $start = time;
 
@@ -112,10 +111,12 @@
     # getting relevant information for each circ candidate ...
     my@lineparts=split(/\t/,$longline);
     my$coords=$lineparts[0];
+		my$strand=$lineparts[1];
     my$refseqID=$lineparts[2];
     my$gene=$lineparts[3];
     my$circn=$lineparts[4];
     # adding hallmark gene type
+	#	print "refseqid is $refseqID\n";
     if(grep(/$gene/,@allehallmarkg)){			# get all samplenames into @allenames)
   #  print "looking for $gene in gene mapping ---\n";
     # find hallmark class  and add to matrix file
@@ -141,8 +142,7 @@
       $marti="NaN";
     }
 
-    push(@headers,"$coords\t$refseqID\t$gene\t$circn\t$hallm\t$marti\t");# header into array
-    #if($var==1){# to test first only the second line , later all lines
+    push(@headers,"$coords\t$strand\t$refseqID\t$gene\t$circn\t$hallm\t$marti\t");# header into array
       my$e=0;
       my$allthings="";
       foreach my$samplepos (@sampleuniqc){
@@ -155,8 +155,7 @@
         $allthings="$allthings\t$lineparts[$samplepos]";
         #$alluniques[$var][$e]=$lineparts[$samplepos];# two-dimensional array
       }
-      push(@alluniques,$allthings);
-    #}
+      push(@alluniques,$allthings);# all unique count positions in .mat1 file
   }
   else{
   # header bar only- catch columns with samplenames in it and their position
@@ -180,14 +179,15 @@
 # actual file creation:
 # header line in output file ...
 my@uniques= uniq @samplenames;
-print OUT"coordinates\trefseqid\tgene\tcircn\thallm\tbiom_desc\t";
+print OUT"coordinates\tstrand\trefseqid\tgene\tcircn\thallm\tbiom_desc\t";
 foreach my $sampl (@uniques){
   print OUT"$sampl\t";
 }
 print OUT "\n";
 # now the real content, cleaning it from junk and then printing it
 for (my $v = 0; $v < scalar(@headers); $v++) {
   my$outline="$headers[$v]$alluniques[$v]";
+	# messy whitespace cleanup
   $outline=~s/\t\t+/\t/g;
   $outline=~s/\t\s+/\t/g;
 	$outline=~s/\s+\t/\t/g;