#!/usr/bin/perl
use strict;
use warnings;
my $file1 = $ARGV[0];
my %chain1; # background seq
my %chain2; # CpG island
my %first_nocpg_first; # 1st nucleotide of the 1st background seq
my %first_nocpg_second; # 1st nucleotide of the 2nd background seq
my %first_yescpg; # 1st nucleotide of the CpG island
open(my $FILE1, '<', $file1) or die "cannot open $file1: $!\n";
while (<$FILE1>) {
chomp;
my ($first_col, $second_col, $seq) = split("\t", $_);
my $cpg_start = $first_col; #cpg_start = primer nt que ES cpg
my $cpg_end = $second_col-1; #cpg_end = último nt que ES cpg
my @nt_seq = split(//, $seq);
my $len = scalar @nt_seq;
my @firstnt_yescpg = $nt_seq[$cpg_start];
my @firstnt_nocpg_first = $nt_seq[0];
my @firstnt_nocpg_second = $nt_seq[$second_col];
my @not_cpg = @nt_seq[0..499, $second_col..$len];
my @yes_cpg = @nt_seq[$cpg_start..$cpg_end];
my $i=1;
for my $nt_nocpg (@not_cpg) {
$chain1{$not_cpg[$i]}{$not_cpg[$i-1]}++;
$i++;
}
my $f=1;
for my $nt_yescpg (@yes_cpg) {
$chain2{$yes_cpg[$f]}{$yes_cpg[$f-1]}++;
$f++;
}
my $j=0;
for my $firstnt_nocpg (@firstnt_nocpg_first) {
$first_nocpg_first{$firstnt_nocpg_first[$j]} += @firstnt_nocpg_first;
$j++;
}
my $k=0;
for my $firstnt_nocpg (@firstnt_nocpg_second) {
$first_nocpg_second{$firstnt_nocpg_second[$k]} += @firstnt_nocpg_second;
$k++;
}
my $l=0;
for my $firstntyescpg (@firstnt_yescpg) {
$first_yescpg{$firstnt_yescpg[$l]} += @firstnt_yescpg;
$l++;
}
}
my $nAA = $chain1{A}{A} / ($chain1{A}{A} + $chain1{C}{A} + $chain1{G}{A} + $chain1{T}{A});
my $nCA = $chain1{C}{A} / ($chain1{A}{A} + $chain1{C}{A} + $chain1{G}{A} + $chain1{T}{A});
my $nGA = $chain1{G}{A} / ($chain1{A}{A} + $chain1{C}{A} + $chain1{G}{A} + $chain1{T}{A});
my $nTA = $chain1{T}{A} / ($chain1{A}{A} + $chain1{C}{A} + $chain1{G}{A} + $chain1{T}{A});
my $nAC = $chain1{A}{C} / ($chain1{A}{C} + $chain1{C}{C} + $chain1{G}{C} + $chain1{T}{C});
my $nCC = $chain1{C}{C} / ($chain1{A}{C} + $chain1{C}{C} + $chain1{G}{C} + $chain1{T}{C});
my $nGC = $chain1{G}{C} / ($chain1{A}{C} + $chain1{C}{C} + $chain1{G}{C} + $chain1{T}{C});
my $nTC = $chain1{T}{C} / ($chain1{A}{C} + $chain1{C}{C} + $chain1{G}{C} + $chain1{T}{C});
my $nAG = $chain1{A}{G} / ($chain1{A}{G} + $chain1{C}{G} + $chain1{G}{G} + $chain1{T}{G});
my $nCG = $chain1{C}{G} / ($chain1{A}{G} + $chain1{C}{G} + $chain1{G}{G} + $chain1{T}{G});
my $nGG = $chain1{G}{G} / ($chain1{A}{G} + $chain1{C}{G} + $chain1{G}{G} + $chain1{T}{G});
my $nTG = $chain1{G}{G} / ($chain1{A}{G} + $chain1{C}{G} + $chain1{G}{G} + $chain1{T}{G});
my $nAT = $chain1{A}{T} / ($chain1{A}{T} + $chain1{C}{T} + $chain1{G}{T} + $chain1{T}{T});
my $nCT = $chain1{C}{T} / ($chain1{A}{T} + $chain1{C}{T} + $chain1{G}{T} + $chain1{T}{T});
my $nGT = $chain1{G}{T} / ($chain1{A}{T} + $chain1{C}{T} + $chain1{G}{T} + $chain1{T}{T});
my $nTT = $chain1{G}{T} / ($chain1{A}{T} + $chain1{C}{T} + $chain1{G}{T} + $chain1{T}{T});
my $preA = $first_nocpg_first{A} + $first_nocpg_second{A};
my $preC = $first_nocpg_first{C} + $first_nocpg_second{C};
my $preG = $first_nocpg_first{G} + $first_nocpg_second{G};
my $preT = $first_nocpg_first{T} + $first_nocpg_second{T};
my $nA = $preA / ($preA + $preC + $preG + $preT);
my $nC = $preC / ($preA + $preC + $preG + $preT);
my $nG = $preG / ($preA + $preC + $preG + $preT);
my $nT = $preT / ($preA + $preC + $preG + $preT);
print "\nFrequencies outside CpG island:\n";
print "A: $nA", "\n";
print "C: $nC", "\n";
print "G: $nG", "\n";
print "T: $nT", "\n\n";
print "A|A: $nAA", "\t", "A|G: $nAG","\n";
print "C|A: $nCA", "\t", "C|G: $nCG","\n";
print "G|A: $nGA", "\t", "G|G: $nGG","\n";
print "T|A: $nTA", "\t", "T|G: $nTG","\n";
print "A|C: $nAC", "\t", "A|T: $nAT","\n";
print "C|C: $nCC", "\t", "C|T: $nCT","\n";
print "G|C: $nGC", "\t", "G|T: $nGT","\n";
print "T|C: $nTC", "\t", "T|T: $nTT","\n\n";
my $yAA = $chain2{A}{A} / ($chain2{A}{A} + $chain2{C}{A} + $chain2{G}{A} + $chain2{T}{A});
my $yCA = $chain2{C}{A} / ($chain2{A}{A} + $chain2{C}{A} + $chain2{G}{A} + $chain2{T}{A});
my $yGA = $chain2{G}{A} / ($chain2{A}{A} + $chain2{C}{A} + $chain2{G}{A} + $chain2{T}{A});
my $yTA = $chain2{T}{A} / ($chain2{A}{A} + $chain2{C}{A} + $chain2{G}{A} + $chain2{T}{A});
my $yAC = $chain2{A}{C} / ($chain2{A}{C} + $chain2{C}{C} + $chain2{G}{C} + $chain2{T}{C});
my $yCC = $chain2{C}{C} / ($chain2{A}{C} + $chain2{C}{C} + $chain2{G}{C} + $chain2{T}{C});
my $yGC = $chain2{G}{C} / ($chain2{A}{C} + $chain2{C}{C} + $chain2{G}{C} + $chain2{T}{C});
my $yTC = $chain2{T}{C} / ($chain2{A}{C} + $chain2{C}{C} + $chain2{G}{C} + $chain2{T}{C});
my $yAG = $chain2{A}{G} / ($chain2{A}{G} + $chain2{C}{G} + $chain2{G}{G} + $chain2{T}{G});
my $yCG = $chain2{C}{G} / ($chain2{A}{G} + $chain2{C}{G} + $chain2{G}{G} + $chain2{T}{G});
my $yGG = $chain2{G}{G} / ($chain2{A}{G} + $chain2{C}{G} + $chain2{G}{G} + $chain2{T}{G});
my $yTG = $chain2{G}{G} / ($chain2{A}{G} + $chain2{C}{G} + $chain2{G}{G} + $chain2{T}{G});
my $yAT = $chain2{A}{T} / ($chain2{A}{T} + $chain2{C}{T} + $chain2{G}{T} + $chain2{T}{T});
my $yCT = $chain2{C}{T} / ($chain2{A}{T} + $chain2{C}{T} + $chain2{G}{T} + $chain2{T}{T});
my $yGT = $chain2{G}{T} / ($chain2{A}{T} + $chain2{C}{T} + $chain2{G}{T} + $chain2{T}{T});
my $yTT = $chain2{G}{T} / ($chain2{A}{T} + $chain2{C}{T} + $chain2{G}{T} + $chain2{T}{T});
my $ypreA = $first_yescpg{A};
my $ypreC = $first_yescpg{C};
my $ypreG = $first_yescpg{G};
my $ypreT = $first_yescpg{T};
my $yA = $ypreA / ($ypreA + $ypreC + $ypreG + $ypreT);
my $yC = $ypreC / ($ypreA + $ypreC + $ypreG + $ypreT);
my $yG = $ypreG / ($ypreA + $ypreC + $ypreG + $ypreT);
my $yT = $ypreT / ($ypreA + $ypreC + $ypreG + $ypreT);
print "\nFrequencies inside CpG island:\n";
print "A: $yA", "\n";
print "C: $yC", "\n";
print "G: $yG", "\n";
print "T: $yT", "\n\n";
print "A|A: $yAA", "\t", "A|G: $yAG","\n";
print "C|A: $yCA", "\t", "C|G: $yCG","\n";
print "G|A: $yGA", "\t", "G|G: $yGG","\n";
print "T|A: $yTA", "\t", "T|G: $yTG","\n";
print "A|C: $yAC", "\t", "A|T: $yAT","\n";
print "C|C: $yCC", "\t", "C|T: $yCT","\n";
print "G|C: $yGC", "\t", "G|T: $yGT","\n";
print "T|C: $yTC", "\t", "T|T: $yTT","\n\n";