Tek-Tips is the largest IT community on the Internet today!

Members share and learn making Tek-Tips Forums the best source of peer-reviewed technical information on the Internet!

  • Congratulations derfloh on being selected by the Tek-Tips community for having the most helpful posts in the forums last week. Way to Go!

bioinformatic problem urgent!

Status
Not open for further replies.

0210828176

Programmer
Joined
Apr 23, 2012
Messages
1
Hi I am new to this forum. I had a query regarding converting DNA code to proteins... which is simple by explanation but I find it hard to get accurate results with code. For ex: In a code like ATGTACTAT (here every 3 non overlapping alphabets get replaced by a single alphabet. For ex: ATG -> M; TAC-> Y; TAT -> Y. I am very new to AWK, I tried making a code below but it doesnt work accurately. can u fix it? thanks in advance



awk 'BEGIN{
c["ATG"]="M"; c["TTT"]="F"; c["TTC"]="F"; c["TTA"]="L"; c["TTG"]="L"; c["CTT"]="L"; c["CTC"]="L"; c["CTA"]="L"; c["CTG"]="L"; c["ATT"]="I"; c["ATC"]="I";
c["ATA"]="I"; c["GTT"]="V"; c["GTC"]="V"; c["GTA"]="V"; c["GTG"]="V"; c["TCT"]="S"; c["TCC"]="S"; c["TCA"]="S"; c["TCG"]="S"; c["CCT"]="P"; c["CCC"]="P";
c["CCA"]="P"; c["CCG"]="P"; c["ACT"]="T"; c["ACC"]="T"; c["ACA"]="T"; c["ACG"]="T"; c["GCT"]="A"; c["GCC"]="A"; c["GCA"]="A"; c["GCG"]="A";c["TAT"]="Y";
c["TAC"]="Y"; c["CAT"]="H"; c["CAC"]="H"; c["CAA"]="Q"; c["CAG"]="Q"; c["AAT"]="N"; c["AAC"]="N"; c["AAA"]="K"; c["AAG"]="K"; c["GAT"]="D"; c["GAC"]="D";
c["GAA"]="E"; c["GAG"]="E"; c["TGT"]="C"; c["TGC"]="C"; c["TGG"]="W"; c["CGT"]=R; c["CGC"]=R; c["CGA"]=R; c["CGG"]=R; c["AGA"]=R; c["AGG"]=R; c["AGT"]="S";
c["AGC"]="S"; c["GGT"]="G"; c["GGC"]="G"; c["GGA"]="G"; c["GGG"]="G";}
{i=1; p=""}
{do {
s=substr($0,i,3)
printf ("%s",s)
{if (c==""){p=p" "} else {p=p c" "}}
i=i+3}
while (s!="")}
{printf("\n%s\n",p)} ' genes_contig0028.txt


 
Hi

0210828176 said:
I tried making a code below but it doesnt work accurately.
Supposing the input is that "ATGTACTAT" you specified, your code gives the result "M Y Y". As far as I understood, that is exactly what you expect.

If I got it wrong, please post some sample input and the desired output.

Maybe also specify the used Awk implementation and version.

Feherke.
 
Just for curiosity I tried it to code:
Code:
[COLOR=#0000ff]#! /bin/awk -f[/color]
[COLOR=#6a5acd]BEGIN[/color]{
  c[[COLOR=#6a5acd]"ATG"[/color]]=[COLOR=#ff00ff]"M"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TTT"[/color]]=[COLOR=#ff00ff]"F"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TTC"[/color]]=[COLOR=#ff00ff]"F"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TTA"[/color]]=[COLOR=#ff00ff]"L"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TTG"[/color]]=[COLOR=#ff00ff]"L"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"CTT"[/color]]=[COLOR=#ff00ff]"L"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CTC"[/color]]=[COLOR=#ff00ff]"L"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CTA"[/color]]=[COLOR=#ff00ff]"L"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CTG"[/color]]=[COLOR=#ff00ff]"L"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"ATT"[/color]]=[COLOR=#ff00ff]"I"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"ATC"[/color]]=[COLOR=#ff00ff]"I"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"ATA"[/color]]=[COLOR=#ff00ff]"I"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GTT"[/color]]=[COLOR=#ff00ff]"V"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GTC"[/color]]=[COLOR=#ff00ff]"V"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GTA"[/color]]=[COLOR=#ff00ff]"V"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"GTG"[/color]]=[COLOR=#ff00ff]"V"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TCT"[/color]]=[COLOR=#ff00ff]"S"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TCC"[/color]]=[COLOR=#ff00ff]"S"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TCA"[/color]]=[COLOR=#ff00ff]"S"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TCG"[/color]]=[COLOR=#ff00ff]"S"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"CCT"[/color]]=[COLOR=#ff00ff]"P"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CCC"[/color]]=[COLOR=#ff00ff]"P"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CCA"[/color]]=[COLOR=#ff00ff]"P"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CCG"[/color]]=[COLOR=#ff00ff]"P"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"ACT"[/color]]=[COLOR=#ff00ff]"T"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"ACC"[/color]]=[COLOR=#ff00ff]"T"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"ACA"[/color]]=[COLOR=#ff00ff]"T"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"ACG"[/color]]=[COLOR=#ff00ff]"T"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GCT"[/color]]=[COLOR=#ff00ff]"A"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GCC"[/color]]=[COLOR=#ff00ff]"A"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"GCA"[/color]]=[COLOR=#ff00ff]"A"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GCG"[/color]]=[COLOR=#ff00ff]"A"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TAT"[/color]]=[COLOR=#ff00ff]"Y"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TAC"[/color]]=[COLOR=#ff00ff]"Y"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CAT"[/color]]=[COLOR=#ff00ff]"H"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"CAC"[/color]]=[COLOR=#ff00ff]"H"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CAA"[/color]]=[COLOR=#ff00ff]"Q"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CAG"[/color]]=[COLOR=#ff00ff]"Q"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"AAT"[/color]]=[COLOR=#ff00ff]"N"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"AAC"[/color]]=[COLOR=#ff00ff]"N"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"AAA"[/color]]=[COLOR=#ff00ff]"K"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"AAG"[/color]]=[COLOR=#ff00ff]"K"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GAT"[/color]]=[COLOR=#ff00ff]"D"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GAC"[/color]]=[COLOR=#ff00ff]"D"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GAA"[/color]]=[COLOR=#ff00ff]"E"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"GAG"[/color]]=[COLOR=#ff00ff]"E"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TGT"[/color]]=[COLOR=#ff00ff]"C"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TGC"[/color]]=[COLOR=#ff00ff]"C"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"TGG"[/color]]=[COLOR=#ff00ff]"W"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CGT"[/color]]=[COLOR=#ff00ff]"R"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"CGC"[/color]]=[COLOR=#ff00ff]"R"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CGA"[/color]]=[COLOR=#ff00ff]"R"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"CGG"[/color]]=[COLOR=#ff00ff]"R"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"AGA"[/color]]=[COLOR=#ff00ff]"R"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"AGG"[/color]]=[COLOR=#ff00ff]"R"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"AGT"[/color]]=[COLOR=#ff00ff]"S"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"AGC"[/color]]=[COLOR=#ff00ff]"S"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GGT"[/color]]=[COLOR=#ff00ff]"G"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GGC"[/color]]=[COLOR=#ff00ff]"G"[/color][COLOR=#6a5acd];[/color] c[[COLOR=#6a5acd]"GGA"[/color]]=[COLOR=#ff00ff]"G"[/color][COLOR=#6a5acd];[/color] 
  c[[COLOR=#6a5acd]"GGG"[/color]]=[COLOR=#ff00ff]"G"[/color][COLOR=#6a5acd];[/color]
}

{ 
  old_line = [COLOR=#6a5acd]$0[/color]
  new_line = [COLOR=#ff00ff]""[/color]
  char3 = [COLOR=#ff00ff]"xxx"[/color]
  i = [COLOR=#ff00ff]1[/color]
  [COLOR=#804040][b]while[/b][/color] (char3) {
    [COLOR=#0000ff]# get 3 chars from line[/color]
    char3 = [COLOR=#008080]substr[/color](old_line[COLOR=#6a5acd],[/color] i[COLOR=#6a5acd],[/color] [COLOR=#ff00ff]3[/color])
    [COLOR=#804040][b]if[/b][/color] (char3) {
      [COLOR=#0000ff]#printf "char3 = '%s'\n", char3[/color]
      [COLOR=#804040][b]if[/b][/color] (char3 in c) {
        new_line = new_line c[[COLOR=#6a5acd]char3[/color]]
        [COLOR=#0000ff]#printf "* new_line = '%s'\n", new_line[/color]
      } 
      [COLOR=#804040][b]else[/b][/color] {
        [COLOR=#804040][b]printf[/b][/color] [COLOR=#ff00ff]"* Error: key '[/color][COLOR=#6a5acd]%s[/color][COLOR=#ff00ff]' not found in array c ![/color][COLOR=#6a5acd]\n[/color][COLOR=#ff00ff]"[/color][COLOR=#6a5acd],[/color] char3 
      }
    }
    [COLOR=#0000ff]# move to the next 3 chars[/color]
    i += [COLOR=#ff00ff]3[/color]  
  }
  [COLOR=#804040][b]printf[/b][/color] [COLOR=#ff00ff]"old: '[/color][COLOR=#6a5acd]%s[/color][COLOR=#ff00ff]' ==> new: '[/color][COLOR=#6a5acd]%s[/color][COLOR=#ff00ff]'[/color][COLOR=#6a5acd]\n[/color][COLOR=#ff00ff]"[/color][COLOR=#6a5acd],[/color] old_line[COLOR=#6a5acd],[/color] new_line  
}
For following example data
Code:
ACTCGCTAT
GCGTGGAAA
TACGAGACT
it outputs
Code:
old: 'ACTCGCTAT' ==> new: 'TRY'
old: 'GCGTGGAAA' ==> new: 'AWK'
old: 'TACGAGACT' ==> new: 'YET'
 
Replace this:
c["CGT"]=R; c["CGC"]=R; c["CGA"]=R; c["CGG"]=R; c["AGA"]=R; c["AGG"]=R
with this:
c["CGT"]="R"; c["CGC"]="R"; c["CGA"]="R"; c["CGG"]="R"; c["AGA"]="R"; c["AGG"]="R"

Hope This Helps, PH.
FAQ219-2884
FAQ181-2886
 
Anywy, a simpler way for OP's code:
Code:
awk 'BEGIN{
c["ATG"]="M"
c["TTT"]=c["TTC"]="F"
c["TTA"]=c["TTG"]=c["CTT"]=c["CTC"]=c["CTA"]=c["CTG"]="L"
c["ATT"]=c["ATC"]=c["ATA"]="I"
c["GTT"]=c["GTC"]=c["GTA"]=c["GTG"]="V"
c["TCT"]=c["TCC"]=c["TCA"]=c["TCG"]=c["AGT"]=c["AGC"]="S"
 c["CCT"]=c["CCC"]=c["CCA"]=c["CCG"]="P"
c["ACT"]=c["ACC"]=c["ACA"]=c["ACG"]="T"
c["GCT"]=c["GCC"]=c["GCA"]=c["GCG"]="A"
c["TAT"]=c["TAC"]="Y"
c["CAT"]=c["CAC"]="H"
c["CAA"]=c["CAG"]="Q"
c["AAT"]=c["AAC"]="N"
c["AAA"]=c["AAG"]="K"
c["GAT"]=c["GAC"]="D"
c["GAA"]=c["GAG"]="E"
c["TGT"]=c["TGC"]="C"
c["TGG"]="W"
c["CGT"]=c["CGC"]=c["CGA"]=c["CGG"]=c["AGA"]=c["AGG"]="R"
c["GGT"]=c["GGC"]=c["GGA"]=c["GGG"]="G"
}
{print;p="";for(i=1;i<=length($0);i+=3)p=p c[substr($0,i,3)]" ";print p}
' genes_contig0028.txt

Hope This Helps, PH.
FAQ219-2884
FAQ181-2886
 
Status
Not open for further replies.

Part and Inventory Search

Sponsor

Back
Top