use Bio::SeqIO;
sub conteo_palabras_genoma_completo
{
my ($infile,$out_intergenic_file,$min_intergenic_size,$max_intergenic_size) = @_;
my ($n_of_intergenic,$gi,$start,$end,$length,$strand,$taxon) = (0);
# print "ingrese el nombre del archivo:\n";
# $infile=<STDIN>;
# chop($infile);
my $in = new Bio::SeqIO(-file => "NC_002755.gb", -format => 'genbank');
open(SALIDA,">conteo_genoma_completo.txt");
open(LIDER,">lider.FASTA");
open(COMPLEMENTARIA,">complementaria.FASTA");
while( my $seq = $in->next_seq) # scan all sequences found in $input
{
my ($gbaccession,$sequence,$gen,@genes) = ( $seq->accession() );
$sequence = $seq->primary_seq()->seq() ||
'no encuentro la secuencia primaria, necesito un fichero GenBank completo!';
$taxon = '';
for my $f ($seq->get_SeqFeatures)
{
if($f->primary_tag =~ /CDS|rRNA|tRNA/) # campos de 'genes'
{
$gi = '';
if($f->has_tag('db_xref'))
{
my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }
}
elsif($f->has_tag('locus_tag'))
{
if($gi eq '' && $f->has_tag('locus_tag'))
{
$gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));
}
}
next if($gi eq '');
$start = $f->start();
$end = $f->end();
$strand = $f->location()->strand();
push(@genes,[$start,$end,$gi,$strand]);
}
elsif($f->primary_tag() =~ /source/)
{
if($f->has_tag('organism'))
{
foreach my $element ($f->each_tag_value('organism'))
{
$taxon .= "[$element],";
}
chop $taxon;
}
}
}
my %promotoras;
for($gen=1;$gen<scalar(@genes);$gen++)
{
$n_of_intergenic++;
$start = 0;
$end ;
$length = $end-$start;
$molde=substr($sequence,$start,$length);
print LIDER ">intergenic$n_of_intergenic|$gbaccession|coords:$start..$end|".
"length:$length|($genes[$gen-1][3])".
"|$taxon\n";
print LIDER $molde."\n";
$promotoras{$gen}=$molde;
$lider=substr($sequence,$start,$length);
$revcom= reverse $lider;
$revcom =~ tr/ACGTacgt/TGCAtgca/;
$length = length($revcom);
print COMPLEMENTARIA ">intergenic$n_of_intergenic|$gbaccession|coords:$start..$end|".
"length:$length|($genes[$gen-1][3])".
"|$taxon\n$revcom\n";
$promotoras{$gen}=$revcom;
my %palabra;
foreach my $id (keys %promotoras){
$limite=length($promotoras{$id})-8+1;
for($i=0;$i<$limite;$i++)
{
my $cadena=substr($promotoras{$id},$i,8);
if (exists $palabra{$cadena}){
$palabra{$cadena}++;
}
else{
$palabra{$cadena}=1;
}
}
}
foreach my $elemento(sort {$palabra{$a} <=> $palabra{$b}} keys %palabra)
{
print SALIDA "$elemento \t $palabra{$elemento}\n";
}
}
}
close(FNA);
return $n_of_intergenic;
}
&conteo_palabras_genoma_completo;