Hi!
I have a text of fasta files and am trying to split. these are from 4 sources (> 1:, > 2:, > 3:, and > 4
separated into blocks by #. i want to make separate files for each block such as file1 file 2 only if that block has information from all the four resources.
i tried a piece of code but it is far from executable. could you guys sort it out.
Thanks
existing code
#!/usr/bin/perl
open(DATA, "split.txt")\n";
while (<DATA>) {
$line = $_;
chomp($line);
$lineNum = 1;
if ($line =~ /^>/) {
$Name = $line;
$Name =~ s/^\s{1,}>//;
$Name =~ s/^>\s{1,}//;
$Name =~ s/>//;
$Name =~ s/\s.*$//;
$Name = "$Name" . "\.seq";
if ($numSeqs > 0) {
close(OUT_FILE);
}
open(OUT_FILE, ">$Name");
print OUT_FILE ">$Name\n";
}
else {
#$entries = "$entries" . "$line";
print OUT_FILE "$line\n";
} #end else
++$numSeqs;
} #end while
#print OUT_FILE "$entries\n";
close (OUT_FILE);
close (DATA);
print "entries = $numSeqs\n";
} #end Main
Input data
#
> 1:333078-333779
GAATATCCCCATGATCTTTCCCTCAATCGCCCGCTGATAAGTGGGAAGACATCG
GTCGCGCCACACTCGATACCCTGCTCATGGTGGCGCTTGGTCTTCCCTTGGGAAT
> 2:659628-660329
GAATATCCCCATGATCTTCCCCTCAATCGACCTGGACGCTGATAAGTGGAAAGACATCG
GTCGCGCCACACTCGATACCCTGCTCACCCGCATTGGCGCTTGGTCTTCCCTTGGGAAT
> 3:682458-683159
GAATATCCCCATGATCTTCCCCTCAAACCTGGACGTTGATAAGTGGAAAGACATCG
> 4:1630596-1631297
GAATATCCCCATGATCTTTCCCTCAATCGACGCTGATAAGTGGGAAGACAT
GTCGCGCCACACTCGATACCCTGCTCGGCGCTTGGTCTTCCCTTGGGAATC
#
> 1:334683-335218
GGTTGGCGGTGCCGCCCTCGTGCAACCAATCAAGTTTGGTGGCGATGTTG
CACCAACGCTTAGTGTCACCTACTACATCACTAAAAAGTTGAGTTAT
> 2:661233-661768
GGTTGGCGGCGCCGCCCTCGTGCAACCAACAAGTTTGGTGGCGATGTTG
CACCAACGCTAAGTGTAACCTACTACATCAAGGGGCATTACTAAAAAGTT
> 3:681133-681667
AAAATGCAGCACAGAATACTGTCAAGTTTGGTGGCGATGTTG
> 4:1632207-1632742
GGTTGGCGGCGCTGCCCTCGTGCAACCAAAGAAAAGTTTGGTGGCGATGTTG
CACCAACGCTTAGTGTCACCTACTACATCAACGGGTATTACTAAAAAGTTGAG
#
> 1:335667-335823
AATGACCGAAATCAAGGAAGCTTTTGTCCCCCCCAGTGATTGAAGTGCTAGTCG
TTGGCGATACCGTCTCCAAGGGCCAAAGTTTCAACCATGGAAGTACCTTCGTCA
> 2:1731369-1731525
AATGACCGAAATCAAGGAAGCTTTTGTCCCCCCCAGTGATTGAAGTGCTAG
TTGGCGATACCGTCTCCAAGGGCCAAACAACCATGGAAGTACCCTCGTCA
> 3:679065-679221
AATGACCGAAATCAAGGAAGCTTTTGTCGTCCCAGTGATTGAAGTGCTAGTC
TTGGCGATACCGTCTCCAAGGGCCAAAGCAACCATGGAAGTACCCTCGTCA
#
desired output
file 1
> 1:333078-333779
GAATATCCCCATGATCTTTCCCTCAATCGCCCGCTGATAAGTGGGAAGACATCG
GTCGCGCCACACTCGATACCCTGCTCATGGTGGCGCTTGGTCTTCCCTTGGGAAT
> 2:659628-660329
GAATATCCCCATGATCTTCCCCTCAATCGACCTGGACGCTGATAAGTGGAAAGACATCG
GTCGCGCCACACTCGATACCCTGCTCACCCGCATTGGCGCTTGGTCTTCCCTTGGGAAT
> 3:682458-683159
GAATATCCCCATGATCTTCCCCTCAAACCTGGACGTTGATAAGTGGAAAGACATCG
> 4:1630596-1631297
GAATATCCCCATGATCTTTCCCTCAATCGACGCTGATAAGTGGGAAGACAT
GTCGCGCCACACTCGATACCCTGCTCGGCGCTTGGTCTTCCCTTGGGAATC
file 2
> 1:334683-335218
GGTTGGCGGTGCCGCCCTCGTGCAACCAATCAAGTTTGGTGGCGATGTTG
CACCAACGCTTAGTGTCACCTACTACATCACTAAAAAGTTGAGTTAT
> 2:661233-661768
GGTTGGCGGCGCCGCCCTCGTGCAACCAACAAGTTTGGTGGCGATGTTG
CACCAACGCTAAGTGTAACCTACTACATCAAGGGGCATTACTAAAAAGTT
> 3:681133-681667
AAAATGCAGCACAGAATACTGTCAAGTTTGGTGGCGATGTTG
> 4:1632207-1632742
GGTTGGCGGCGCTGCCCTCGTGCAACCAAAGAAAAGTTTGGTGGCGATGTTG
CACCAACGCTTAGTGTCACCTACTACATCAACGGGTATTACTAAAAAGTTGAG
I have a text of fasta files and am trying to split. these are from 4 sources (> 1:, > 2:, > 3:, and > 4
i tried a piece of code but it is far from executable. could you guys sort it out.
Thanks
existing code
#!/usr/bin/perl
open(DATA, "split.txt")\n";
while (<DATA>) {
$line = $_;
chomp($line);
$lineNum = 1;
if ($line =~ /^>/) {
$Name = $line;
$Name =~ s/^\s{1,}>//;
$Name =~ s/^>\s{1,}//;
$Name =~ s/>//;
$Name =~ s/\s.*$//;
$Name = "$Name" . "\.seq";
if ($numSeqs > 0) {
close(OUT_FILE);
}
open(OUT_FILE, ">$Name");
print OUT_FILE ">$Name\n";
}
else {
#$entries = "$entries" . "$line";
print OUT_FILE "$line\n";
} #end else
++$numSeqs;
} #end while
#print OUT_FILE "$entries\n";
close (OUT_FILE);
close (DATA);
print "entries = $numSeqs\n";
} #end Main
Input data
#
> 1:333078-333779
GAATATCCCCATGATCTTTCCCTCAATCGCCCGCTGATAAGTGGGAAGACATCG
GTCGCGCCACACTCGATACCCTGCTCATGGTGGCGCTTGGTCTTCCCTTGGGAAT
> 2:659628-660329
GAATATCCCCATGATCTTCCCCTCAATCGACCTGGACGCTGATAAGTGGAAAGACATCG
GTCGCGCCACACTCGATACCCTGCTCACCCGCATTGGCGCTTGGTCTTCCCTTGGGAAT
> 3:682458-683159
GAATATCCCCATGATCTTCCCCTCAAACCTGGACGTTGATAAGTGGAAAGACATCG
> 4:1630596-1631297
GAATATCCCCATGATCTTTCCCTCAATCGACGCTGATAAGTGGGAAGACAT
GTCGCGCCACACTCGATACCCTGCTCGGCGCTTGGTCTTCCCTTGGGAATC
#
> 1:334683-335218
GGTTGGCGGTGCCGCCCTCGTGCAACCAATCAAGTTTGGTGGCGATGTTG
CACCAACGCTTAGTGTCACCTACTACATCACTAAAAAGTTGAGTTAT
> 2:661233-661768
GGTTGGCGGCGCCGCCCTCGTGCAACCAACAAGTTTGGTGGCGATGTTG
CACCAACGCTAAGTGTAACCTACTACATCAAGGGGCATTACTAAAAAGTT
> 3:681133-681667
AAAATGCAGCACAGAATACTGTCAAGTTTGGTGGCGATGTTG
> 4:1632207-1632742
GGTTGGCGGCGCTGCCCTCGTGCAACCAAAGAAAAGTTTGGTGGCGATGTTG
CACCAACGCTTAGTGTCACCTACTACATCAACGGGTATTACTAAAAAGTTGAG
#
> 1:335667-335823
AATGACCGAAATCAAGGAAGCTTTTGTCCCCCCCAGTGATTGAAGTGCTAGTCG
TTGGCGATACCGTCTCCAAGGGCCAAAGTTTCAACCATGGAAGTACCTTCGTCA
> 2:1731369-1731525
AATGACCGAAATCAAGGAAGCTTTTGTCCCCCCCAGTGATTGAAGTGCTAG
TTGGCGATACCGTCTCCAAGGGCCAAACAACCATGGAAGTACCCTCGTCA
> 3:679065-679221
AATGACCGAAATCAAGGAAGCTTTTGTCGTCCCAGTGATTGAAGTGCTAGTC
TTGGCGATACCGTCTCCAAGGGCCAAAGCAACCATGGAAGTACCCTCGTCA
#
desired output
file 1
> 1:333078-333779
GAATATCCCCATGATCTTTCCCTCAATCGCCCGCTGATAAGTGGGAAGACATCG
GTCGCGCCACACTCGATACCCTGCTCATGGTGGCGCTTGGTCTTCCCTTGGGAAT
> 2:659628-660329
GAATATCCCCATGATCTTCCCCTCAATCGACCTGGACGCTGATAAGTGGAAAGACATCG
GTCGCGCCACACTCGATACCCTGCTCACCCGCATTGGCGCTTGGTCTTCCCTTGGGAAT
> 3:682458-683159
GAATATCCCCATGATCTTCCCCTCAAACCTGGACGTTGATAAGTGGAAAGACATCG
> 4:1630596-1631297
GAATATCCCCATGATCTTTCCCTCAATCGACGCTGATAAGTGGGAAGACAT
GTCGCGCCACACTCGATACCCTGCTCGGCGCTTGGTCTTCCCTTGGGAATC
file 2
> 1:334683-335218
GGTTGGCGGTGCCGCCCTCGTGCAACCAATCAAGTTTGGTGGCGATGTTG
CACCAACGCTTAGTGTCACCTACTACATCACTAAAAAGTTGAGTTAT
> 2:661233-661768
GGTTGGCGGCGCCGCCCTCGTGCAACCAACAAGTTTGGTGGCGATGTTG
CACCAACGCTAAGTGTAACCTACTACATCAAGGGGCATTACTAAAAAGTT
> 3:681133-681667
AAAATGCAGCACAGAATACTGTCAAGTTTGGTGGCGATGTTG
> 4:1632207-1632742
GGTTGGCGGCGCTGCCCTCGTGCAACCAAAGAAAAGTTTGGTGGCGATGTTG
CACCAACGCTTAGTGTCACCTACTACATCAACGGGTATTACTAAAAAGTTGAG