Skip to content

Commit

Permalink
add scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
dulunar committed Sep 19, 2021
1 parent 2160f81 commit 2b3ad16
Show file tree
Hide file tree
Showing 25 changed files with 2,261 additions and 0 deletions.
204 changes: 204 additions & 0 deletions Scripts/1.preprocess/Median.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
#!usr/bin/perl -w
use strict;
use Getopt::Long;

my $red = "\033[0;31m";
my $end = "\033[0m";

my($in,$out,$len,$chr,$help);

GetOptions
(
"i=s"=>\$in,
"o=s"=>\$out,
"l=i"=>\$len,
"ch=s"=>\$chr,
"help|?"=>\$help
);

my $usage=<<INFO;
Notice:
${red}The input .sam/.bam file must be sorted according to coordinate.$end
Usage:
perl $0 [options]
Options:
-i <string> <in.input sam/bam file>
-o <string> <out.output median of each positions>
-l <int> <the longest length of reads, default=150>
-ch <string> <the number of chromosome>
INFO

die $usage if ($help || !$in || !$out ||!$chr);

die "$red$in does not exists$end\n" if(!(-e "$in"));

if($in =~ /\.bam$/)
{
my $head = `samtools view -H $in | grep SO`;
$head =~ s/^\s+|\s+$//g;
die "$red$in is not sorted bam file$end\n" if($head !~ /coordinate/);
my $bai=$in;
$bai=~s/\.bam/\.bai/g;
if( (!-e "${in}.bai") || (!-e "$bai") )
{
`samtools index -@ 20 $in`;
}
open IN,"samtools view $in $chr |" || die $!;
}
else
{
my $so = &Sorted($in);
die "$red$in is not sorted sam file$end\n" if($so == 0);
if($in =~ /\.gz$/)
{
open IN,"gzip -dc $in | awk '$3==$chr' | " || die $!;
}
else
{
open IN,"grep -w $chr $in |" || die $!;
}
}

sub Sorted
{
my $in = @_;
if($in =~ /\.gz/)
{
open IN, "gzip -dc $in | grep -v '^\@' | " || die $!;
}
else
{
open IN, "$in | grep -v '^\@' | " || die $!;
}

my $read = 0;
my $pos = 0;
my $tags = 0;
my $tagu = 0;
my $chrt;
while(my $line = <IN>)
{
chomp $line;
next unless($line !~ /^@/);
$read ++;
last if($read > 2000);
my @F = split /\t/, $line;
if($read == 1)
{
$pos = $F[3];
$chrt = $F[2];
}
else
{
last if($F[2] ne $chrt);
if($F[3] >= $pos)
{
$tags ++;
$pos = $F[3];
}
else
{
$tagu ++;
}
}
}

my $sort = ($tags > $tagu) ? 1 : 0;
return $sort;
}

if($out =~ /\.gz$/)
{
open OUT,"| gzip > $out" || die $!;
}
else
{
open OUT,"| gzip > $out.gz" || die $!;
}

$len ||= 150;

#initialization of @Median
my @Median = ();
for my $i(0..($len-1))
{
$Median[$i] = [];
}

sub Find_Median
{
my $remainder = $#_%2;
my $quotient = int($#_/2);
my $the_Median;
if($remainder == 0)
{
$the_Median = $_[$quotient];
}
else
{
$the_Median = int( ( $_[$quotient+1] + $_[$quotient] ) / 2 );
}
return $the_Median;
}

sub Out_Queue
{
my $median_temp = shift @_;
my @out;
for(my $j = 0;; $j++)
{
last if(!$median_temp->[$j]);
push(@out, $median_temp->[$j]);
}
return @out;
}

my $gap;
my $aloci = 0;
my @median = ();
my $median_value = 0;
while(<IN>)
{
chomp;
my @temp=split/\s+/, $_;
next if($temp[2] ne $chr);
$gap = $temp[3] - $aloci;
if($aloci != 0 and $gap != 0)
{
#progress of median finding
for(my $i = 1; $i <= $gap; $i++)
{
last if !$Median[0]->[0];
@median = &Out_Queue(@Median);
shift @Median;
@median = sort{ $a<=>$b } @median;
$median_value = &Find_Median(@median);
my $print_loci = $aloci + $i - 1;
print OUT "$print_loci\t$median_value\n";
@median = ();
$Median[$len-1] = [];
}
}
my @quality = split '',$temp[10];

#adding the new qualities
for my $k(0..$#quality)
{
my $quality_score = ord($quality[$k])-33;
push(@{$Median[$k]}, $quality_score);
}
$aloci = $temp[3];
}

#print out the rest elements in @Median
while($Median[0]->[0]){
@median = &Out_Queue(@Median);
shift @Median;
@median = sort {$a<=>$b} @median;
$median_value = &Find_Median(@median);
print OUT "$aloci\t$median_value\n";
$aloci++;
}

close IN;
close OUT;
68 changes: 68 additions & 0 deletions Scripts/1.preprocess/PQ_g4predict_seq.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!usr/bin/perl -w
use strict;
use Getopt::Long;
use File::Spec;
use File::Basename;

my $red = "\033[0;31m";
my $end = "\033[0m";

my ($in,$out,$fasta,$help);

GetOptions(
"i=s"=>\$in,
"f=s"=>\$fasta,
"o=s"=>\$out
);

my $usage=<<INFO;
Usgae:
perl $0 [options]
Options:
-i file, input PQ file, the directory of $red\$in will be used for save the output$end
-f file, input fasta file
-o file, the prefix of the output PQ with seq file, ${red}default [PQ_g4predict] [the output is \$out_plus && \$out_minus]$end
INFO

die $usage if ($help || !$in || !$fasta);

my $dir = dirname(File::Spec->rel2abs( $in ));
$dir=~s/^\s+|\s+$//g;

$out ||= "PQ_g4predict";

open IN,"<$in" || die $!;
open OUTP,">$dir/${out}_plus" || die $!;
open OUTM,">$dir/${out}_minus" || die $!;
open INF,"<$fasta" || die $!;

my @ref=('N');
while(<INF>)
{
next if $_ =~ />/;
chomp;
my @fasta=split("",$_);
push(@ref,@fasta);
}
close INF;

while(<IN>)
{
chomp;
my @temp = split/\s+/,$_;
my $pqs = $temp[1] + 1;
my $pqe = $temp[2];
my $line = join('',@ref[$pqs..$pqe]);
my $length = length($line);
if($temp[5] eq "+")
{
print OUTP "$pqs\t$pqe\t$length\t$line\t$temp[4]\t$temp[5]\n";
}
elsif($temp[5] eq "-")
{
print OUTM "$pqs\t$pqe\t$length\t$line\t$temp[4]\t$temp[5]\n";
}
}
close IN;
close OUTP;
close OUTM;
92 changes: 92 additions & 0 deletions Scripts/1.preprocess/SplitFa2Chr_G4Predict.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/perl -w
#########################################################################
# FileName: SplitFa2Chr_G4Predict.pl
# Version: 96a78e56-2c37-4de2-9a77-1ce128c2d58e
# Author: Luna <[email protected]>
# CreatedTime: Thu Sep 9 16:58:20 2021
#########################################################################
use strict;
use Getopt::Long;
use File::Basename;
my $red = "\033[0;31m";
my $end = "\033[0m";
my ($in, $dir, $py2, $G4Predict, $help);

GetOptions
(
"i|in=s"=>\$in,
"d|dir=s"=>\$dir,
"p|py=s"=>\$py2,
"g|g4=s"=>\$G4Predict,
"help|?"=>\$help,
);


my $usage=<<INFO;
Usage:
${red}1. Split the whole genome reference fasta to each chr fasta$end
${red}2. Using g4predict intra for predicting putative G Quadruplexes in each chromosome$end
perl $0
-i <the whole genome reference fasta file>
${red}-d <the directory for saving the chromosome fa file> $end
-p <the absolute path of python2> (default: `which python2`)
-g <the absolute path of g4predict> (default: `which g4predict`)
INFO

die "$usage\n" if ($help || !$in || !$dir);

$py2 ||= `which python2`;
$py2=~s/^\s+|\s+$//g;
die "${red}There did not install python2 $py2$end\n" if(!$py2);

$G4Predict ||= `which g4predict`;
$G4Predict=~s/^\s+|\s+$//g;
die "${red}There did not install g4predict $G4Predict$end\n" if(!$G4Predict);

if($in =~ /\.gz$/)
{
open IN, "gzip -dc $in |" || die $!;
}
else
{
open IN,"$in" || die $!;
}

my @chrs = ();

while (<IN>)
{
chomp;
if ($_ =~ />(.*)/)
{
my $chr = $1;

my $out = "$chr.fa";
if(!-e "$dir/$chr")
{
`mkdir -p $dir/$chr` || die "${red}Error: can't create $dir/$chr$end\n";
}
push(@chrs, $chr);
open OUT, "> $dir/$chr/$out" || die $!;
print OUT "$_\n";
}
else
{
print OUT "$_\n";
}
}
close IN;
close OUT;

for my $chr(@chrs)
{
my $fa = "$dir/$chr/$chr.fa";
my $cmd = "$py2 $G4Predict intra -f $fa -b $dir/$chr/PQ_g4predict.bed -s -M";

print "NOTICE: run this command\n${red}$cmd$end\n";
system('$cmd');
}

print "${red}$in was splitted into chromosome fa in $dir$end\n";
print "${red}PQs of each chromosome were predicted by $G4Predict, saved into $dir/chr*$end\n";
9 changes: 9 additions & 0 deletions Scripts/1.preprocess/runG4Predict.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#########################################################################
# FileName: runG4Predict.sh
# Version: 5aee49a0-a543-485d-aaa2-ac0f51b18e69
# Author: Luna <[email protected]>
# CreatedTime: Wed Sep 8 22:53:27 2021
#########################################################################

python2 /usr/local/bin/g4predict intra -f chr22 -b chr22.bed -s -M
Loading

0 comments on commit 2b3ad16

Please sign in to comment.