Skip to content

Commit

Permalink
keeping strand information
Browse files Browse the repository at this point in the history
  • Loading branch information
daaaaande committed Jan 10, 2019
1 parent beffb7f commit 64921c1
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 116 deletions.
2 changes: 1 addition & 1 deletion matrix_infile_checker.pl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
if($namesmale=~/\-/){
warn "line $i file $linfile: samplename $namesmale does have a minus in it! \n";
$mist++;

}


if($mist > 0){
Expand Down
197 changes: 91 additions & 106 deletions matrixmaker-V3_A.pl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# - takes the outfiles from steptwo (processed.csv)
# - for more than one sample (more useful matrix) cat the .processed.csv files into one big file, the rest will be handled by matrixmaker.pl
# - adds a little relevant information to each candidate
# - needs an output filename- itwill output in .tsv format to be readable for matrixtwo.pl (look at the bottom of this file for example lines for each relevant file)
# - needs an output filename- it will output in .tsv format to be readable for matrixtwo.pl (look at the bottom of this file for example lines for each relevant file)
# - tracks time of usage
# - dumps errors into logfile into global logfile
##########################################
Expand All @@ -34,7 +34,6 @@
my$genene=$slit[0];
$genene =~ s/\s+//g; # remove emptieness
my$nnum="";

if(scalar(@slit)>1){
$nnum=$slit[1];# will be key
if($nnum=~/N/){
Expand Down Expand Up @@ -62,7 +61,7 @@
DATA_IN:
for (my$i=0;$i<scalar(@allelines);$i++){
my$line_o_o=$allelines[$i]; # current line
# if ($i>0){# ignore header
if (!($line_o_o=~/coordinates/)){# ignore header if there , but should not be present at this stage anyway
get_names($line_o_o);
sub get_names{# later multithread this also
my$line= shift(@_);
Expand All @@ -77,13 +76,14 @@
if(!(exists($alle_new_info_try_hash{"$cord"}))){ # check for coords redundancy
$alle_new_info_try_hash{"$cord"}=$i;
$basic_info_hash{"$Refseqid\t$i"}=$i;
$alle_coords_hash{"$cord\t$i"}=$i;
$alle_coords_hash{"$cord\t$strand"}=$i;
}

}
# }
}
}
# without tie to sort the hashes we need to sort the hash keys by their value- i.e the line number of the infile

# without tie to keep the hashes in order we need to sort the hash keys by their value- i.e the line number of the infile
# sort the hash keys based on their values (line number in infile )
my@sorted_by_val_allenames = sort{$sample_hash{$a} <=> $sample_hash{$b}} keys %sample_hash;# sort the keys according to their value
my@allenames=@sorted_by_val_allenames; # sample names
Expand All @@ -104,7 +104,7 @@
chomp $circline;
if($circline=~/[a-z]/){
my@slit=split(/\s+/,$circline);
my$chr=$slit[0]; # plan ; chr:start-end to regex the coordinates of candidates
my$chr=$slit[0];
my$cordst=$slit[1];
my$cordnd=$slit[2];
my$circname=$slit[3];
Expand All @@ -129,7 +129,7 @@
chomp $outfile;
open(OU,">",$outfile)|| die "$!";
############################################# get stable header, build resizeable header for samples
print OU "coordinates\tRefseqID\tGene\tknown_circ\tnum_samples_present\ttotal_sum_unique_counts\tqualities\tpresent_in_sample\t";
print OU "coordinates\tstrand\tRefseqID\tGene\tknown_circ\tnum_samples_present\ttotal_sum_unique_counts\tqualities\tpresent_in_sample\t";
foreach my $sampls (@allenames) {
print OU "sample\t-unique_count\t-qualA\t-qualB\t"; # $sampls not in same order as below, need to change it
}
Expand All @@ -146,114 +146,99 @@ sub findc{
DATA_OUT:
foreach my $circs (@allecooords){
$count ++;
$pf->start and next DATA_OUT;
$pf->start and next DATA_OUT;
find_circ($circs);
sub find_circ {
my$circ= shift(@_);
my$circcand=$circ;
$circcand=~s/\t[0-9]{1,30}//;# remove the attached infile line number to not ignore strand differences
#print "circ $circ\n\n\n\n\n";
my$basicinfo=$allebasicinfo[$count -1];
$basicinfo=~s/\t[0-9]{1,30}//;# remove the unique makwer of maybe double circ information
if($basicinfo=~/[A-z]/g){
chomp $circcand;
my$circn="unknown";
chomp $basicinfo;
my$presencething=""; # for each circ cand, add names of sapmles where it is present
my$totalcounts=0; # for each circ cand, add unique counts
my$allquas=""; # for each sample, summarize qualities
my$line="$circcand\t$basicinfo\t";
$line=~s/\n//g;
# print "line is $basicinfo\n";
$basicinfo =~ /N*[\+\-]{1}/; # find refseqid
my$tolookup = $';
chomp $tolookup;
# print "finding information for circ $tolookup with $basicinfo \n";
$tolookup="N$tolookup"; # fix for missing N in refseqid
$tolookup =~s/\s+//g;
my$allsamplelines="";
my$allsamplehit=0;
my$gene_name="";
if(exists($mapping{$tolookup})){
my$geneo=$mapping{$tolookup};
$line="$line\t$geneo";
$gene_name=$geneo;
# print "found gene $gene_name for circ $tolookup in gene mapping hash\n";
}
else {
$line="$line\tunkn";
$gene_name="unkn";
}
if(exists($known_circs{$circcand})){
$circn=$known_circs{$circcand};
}
else{
$circn="unknown";
}
foreach my $single_sample (@allenames) {# looking for each sample for each circ
#print "sample $single_sample\n";
my$allonesample= $allinfoonesamplehash{$single_sample};
if($allonesample=~/$circcand*.*\n/gi){### is the circ is found in sample###
my$line_of_i=$&;
my$lineonesample=$line_of_i; #declare the interesting line
#print "starting to destruct line $lineonesample\n";
$lineonesample=~s /$circcand//;
$lineonesample=~s/\n//;
chomp $lineonesample;
$lineonesample=~s/\t+\+//; # removing the strand information from the hit
$lineonesample=~s/\t+\-//;
# has still the refseq id for every sample , need to remove that redundant informastion aswell
$lineonesample =~ s/NM_[0-9]{3,11}//g;
$lineonesample =~ s/NR_[0-9]{3,11}//g;
$lineonesample =~ s/\.\s+//; # first remove the dot with space
$lineonesample =~ tr/\.//;# then withpout
$lineonesample =~ s/chr[0-9]{0,3}.*\-[0-9]{1,98}\s+?//g;# remove coords sometimes mixed up in here
$allsamplelines="$allsamplelines$lineonesample";
if($allsamplelines=~/chr/){
warn "error in file: $allsamplelines should not include coordinates , whats the problem?\nfull line: $line_of_i\n";
my$circcand= shift(@_);
#print "old line is $circcand\n";
$circcand=~s/\t[0-9]{1,30}//;# remove the attached infile line number to not ignore strand differences
# remove strand, attach later in full line
$circcand=~s/\t[+-]{1}//;
my$str=$&;
my$basicinfo=$allebasicinfo[$count -1];
$basicinfo=~s/\t[0-9]{1,30}//;# remove the unique makwer of maybe double circ information
if($basicinfo=~/[A-z]/g){
chomp $circcand;
my$circn="unknown";
chomp $basicinfo;
my$presencething=""; # for each circ cand, add names of sapmles where it is present
my$totalcounts=0; # for each circ cand, add unique counts
my$allquas=""; # for each sample, summarize qualities
my$line="$circcand\t$basicinfo\t";
$line=~s/\n//g;
my$tolookup=$basicinfo;
chomp $tolookup;
my$allsamplelines="";
my$allsamplehit=0;
my$gene_name="";
if(exists($mapping{$tolookup})){
my$geneo=$mapping{$tolookup};
$line="$line\t$geneo";
$gene_name=$geneo;
# print "found gene $gene_name for circ $tolookup in gene mapping hash\n";
}
$presencething="$presencething-$single_sample";
$lineonesample =~/\s+[0-9]{1,4}\s+/;# only first hit is unique count
my$findnum = $&; # the unique count for each sample
my$twoquals=$'; # the two qualities into one
$twoquals =~ s/\s+/;/;
if(!($allquas=~/N/g)){ # check for refseqid instead of number
$allquas = "$allquas,$twoquals";
$allquas =~s/\s+//g;
$totalcounts=$totalcounts + $findnum;
$ni=$totalcounts;
$allsamplehit++;
else {
$line="$line\tunkn";
$gene_name="unkn";
}
if(exists($known_circs{$circcand})){
$circn=$known_circs{$circcand};
}
else{
# refseqid is recognized as strand...
warn "line not recognized :$line_of_i ,quality is not $allquas or $twoquals\t totalcounts are $totalcounts for sample $single_sample circ $circcand basicinfo $basicinfo \n";
$circn="unknown";
}
}
else{# new: if circ not in all one samples
chomp $single_sample;
foreach my $single_sample (@allenames) {# looking for each sample for each circ
my$allonesample= $allinfoonesamplehash{$single_sample};
if($allonesample=~/$circcand*.*\n/gi){### is the circ is found in sample###
my$line_of_i=$&;
my$lineonesample=$line_of_i; #declare the interesting line
$lineonesample=~s /$circcand//;
$lineonesample=~s/\n//g;
$lineonesample=~s/\t+[\+\-]//; # removing the strand information from the hit
$lineonesample =~ s/N[MR]_[0-9]{3,11}//g; # removing refseqid- is the same for the same coordinates
$lineonesample =~ s/chr[0-9]{0,3}.*\-[0-9]{1,98}\s+?//g;# remove coords sometimes mixed up in here
$allsamplelines="$allsamplelines$lineonesample";
if($allsamplelines=~/chr/){
warn "error in file: $allsamplelines should not include coordinates , whats the problem?\nfull line: $line_of_i\n";
}
$presencething="$presencething-$single_sample";
$lineonesample =~/\s+[0-9]{1,4}\s+/;# only first hit is unique count
my$findnum = $&; # the unique count for each sample
my$twoquals=$'; # the two qualities into one
$twoquals =~ s/\s+/;/;
if(!($allquas=~/N/g)){ # check for refseqid instead of number
$allquas = "$allquas,$twoquals";
$allquas =~s/\s+//g;
$totalcounts=$totalcounts + $findnum;
$ni=$totalcounts;
$allsamplehit++;
}
else{
# refseqid is recognized as strand...
warn "line not recognized :$line_of_i ,quality is not $allquas or $twoquals\t totalcounts are $totalcounts for sample $single_sample circ $circcand basicinfo $basicinfo \n";
}
}
else{# new: if circ not in all one samples
chomp $single_sample;
$allsamplelines="$allsamplelines$single_sample\t0\t0\t0\t";
}
}
$basicinfo=~s/\n//g;
$gene_name=~s/\n//g;
if(((($circcand=~/\:/)&&($presencething=~/[A-z]/)))){
#hr10:93590679-93602148 - NM_001142434 OGA hsa_circ_0008170
my$linestring="$circcand\t$basicinfo\t$gene_name\t$circn\t$allsamplehit\t$ni\t$allquas\t$presencething\t$allsamplelines\n";
$linestring =~s/\t\t/\t/g;
print OU $linestring;
$linestring="";
$gene_name="";
}
else{ # in case something with the line is wrong
print ER "error in line: circand is $circcand \n basicinfo is $basicinfo \n and presencething is $presencething\n";
}
}
}
$basicinfo=~s/\n//g;
$gene_name=~s/\n//g;
if(((($circcand=~/\:/)&&($presencething=~/[A-z]/)))){
my$linestring="$circcand\t$str\t$basicinfo\t$gene_name\t$circn\t$allsamplehit\t$ni\t$allquas\t$presencething\t$allsamplelines\n";
$linestring =~s/\t\t/\t/g;
print OU $linestring;
$linestring="";
$gene_name="";
}
else{ # in case something with the line is wrong
print ER "error in line: circand is $circcand \n basicinfo is $basicinfo \n and presencething is $presencething\n";
}
}

}
# print "ends line $count ..\n";
$pf->finish;
$pf->finish;
}

}# findc end
$pf->wait_all_children;
my$end=time;
Expand Down
18 changes: 9 additions & 9 deletions matrixtwo.pl
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#/usr/bin/perl -w
use strict;
require "/media/daniel/NGS1/RNASeq/find_circ/read_mapping.pl";
use List::MoreUtils qw(uniq);
#system("clear");
require "/media/daniel/NGS1/RNASeq/find_circ/read_mapping.pl"; # module reading mapping file for additional information- can be ignored when useless
use List::MoreUtils qw(uniq); # used to get to a list of unique samplenames later


############################################################ usage
# perl matrixtwo.pl matrixmaker_outfile.csv matrixtwo_output.tsv
Expand All @@ -29,7 +29,6 @@
#
#


open(ER,'>>',"/home/daniel/logfile_auto.log")||die "$!"; # global logfile
my $start = time;

Expand Down Expand Up @@ -112,10 +111,12 @@
# getting relevant information for each circ candidate ...
my@lineparts=split(/\t/,$longline);
my$coords=$lineparts[0];
my$strand=$lineparts[1];
my$refseqID=$lineparts[2];
my$gene=$lineparts[3];
my$circn=$lineparts[4];
# adding hallmark gene type
# print "refseqid is $refseqID\n";
if(grep(/$gene/,@allehallmarkg)){ # get all samplenames into @allenames)
# print "looking for $gene in gene mapping ---\n";
# find hallmark class and add to matrix file
Expand All @@ -141,8 +142,7 @@
$marti="NaN";
}

push(@headers,"$coords\t$refseqID\t$gene\t$circn\t$hallm\t$marti\t");# header into array
#if($var==1){# to test first only the second line , later all lines
push(@headers,"$coords\t$strand\t$refseqID\t$gene\t$circn\t$hallm\t$marti\t");# header into array
my$e=0;
my$allthings="";
foreach my$samplepos (@sampleuniqc){
Expand All @@ -155,8 +155,7 @@
$allthings="$allthings\t$lineparts[$samplepos]";
#$alluniques[$var][$e]=$lineparts[$samplepos];# two-dimensional array
}
push(@alluniques,$allthings);
#}
push(@alluniques,$allthings);# all unique count positions in .mat1 file
}
else{
# header bar only- catch columns with samplenames in it and their position
Expand All @@ -180,14 +179,15 @@
# actual file creation:
# header line in output file ...
my@uniques= uniq @samplenames;
print OUT"coordinates\trefseqid\tgene\tcircn\thallm\tbiom_desc\t";
print OUT"coordinates\tstrand\trefseqid\tgene\tcircn\thallm\tbiom_desc\t";
foreach my $sampl (@uniques){
print OUT"$sampl\t";
}
print OUT "\n";
# now the real content, cleaning it from junk and then printing it
for (my $v = 0; $v < scalar(@headers); $v++) {
my$outline="$headers[$v]$alluniques[$v]";
# messy whitespace cleanup
$outline=~s/\t\t+/\t/g;
$outline=~s/\t\s+/\t/g;
$outline=~s/\s+\t/\t/g;
Expand Down

0 comments on commit 64921c1

Please sign in to comment.