首先,下載病毒基因組信息。
登錄 GenBank,網址:
www.ncbi.nlm.nih.gov/genbank/
查詢輸入:MN908947
MN908947 為我國復旦大學上海公共衛生臨床中心和公共衛生學院於 2020 年 1 月 5 日上傳的新型冠狀病毒全基因組數據。下載 FASTA 文件,另存為:MN908947.fa。
構建讀入基因組函數:
def loadCoronaVirusDNA(filename):
    """Read a FASTA file and return ``(name, dna)``.

    Lines starting with ``>`` are header lines: the first whitespace-separated
    token, minus the leading ``>``, becomes the record name. Every other line
    is stripped of trailing whitespace and appended to the sequence.

    If the file holds several records, their sequences are concatenated and
    the name of the last header wins. If the file has no header line, the
    name is an empty string.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A ``(name, dna)`` tuple of two strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    name = ''  # default so a header-less file does not raise UnboundLocalError
    chunks = []
    # The context manager guarantees the handle is closed, even on error.
    with open(filename) as f:
        for line in f:
            if line.startswith('>'):
                header = line.split()
                name = header[0][1:]
            else:
                chunks.append(line.rstrip())
    # Join once instead of growing a string inside the loop.
    return name, ''.join(chunks)
# Load the genome from MN908947.fa; the relative path is resolved against the
# current working directory. Yields the record name and the full sequence.
coronavirusname, coronavirusdna = loadCoronaVirusDNA('MN908947.fa')
# Codon table mapping each DNA triplet (upper-case A/C/G/T) to a tuple of
# (full amino-acid name, three-letter code, one-letter code).
# The three stop codons use 'Stop' for both names and '___' as their
# one-letter marker.
codon = {
    'ATT': ('Isoleucine', 'Ile', 'I'),
    'ATC': ('Isoleucine', 'Ile', 'I'),
    'ATA': ('Isoleucine', 'Ile', 'I'),
    'CTT': ('Leucine', 'Leu', 'L'),
    'CTC': ('Leucine', 'Leu', 'L'),
    'CTA': ('Leucine', 'Leu', 'L'),
    'CTG': ('Leucine', 'Leu', 'L'),
    'TTA': ('Leucine', 'Leu', 'L'),
    'TTG': ('Leucine', 'Leu', 'L'),
    'GTT': ('Valine', 'Val', 'V'),
    'GTC': ('Valine', 'Val', 'V'),
    'GTA': ('Valine', 'Val', 'V'),
    'GTG': ('Valine', 'Val', 'V'),
    'TTT': ('Phenylalanine', 'Phe', 'F'),
    'TTC': ('Phenylalanine', 'Phe', 'F'),
    'ATG': ('Methionine', 'Met', 'M'),
    'TGT': ('Cysteine', 'Cys', 'C'),
    'TGC': ('Cysteine', 'Cys', 'C'),
    'GCT': ('Alanine', 'Ala', 'A'),
    'GCC': ('Alanine', 'Ala', 'A'),
    'GCA': ('Alanine', 'Ala', 'A'),
    'GCG': ('Alanine', 'Ala', 'A'),
    'GGT': ('Glycine', 'Gly', 'G'),
    'GGC': ('Glycine', 'Gly', 'G'),
    'GGA': ('Glycine', 'Gly', 'G'),
    'GGG': ('Glycine', 'Gly', 'G'),
    'CCT': ('Proline', 'Pro', 'P'),
    'CCC': ('Proline', 'Pro', 'P'),
    'CCA': ('Proline', 'Pro', 'P'),
    'CCG': ('Proline', 'Pro', 'P'),
    'ACT': ('Threonine', 'Thr', 'T'),
    'ACC': ('Threonine', 'Thr', 'T'),
    'ACA': ('Threonine', 'Thr', 'T'),
    'ACG': ('Threonine', 'Thr', 'T'),
    'TCT': ('Serine', 'Ser', 'S'),
    'TCC': ('Serine', 'Ser', 'S'),
    'TCA': ('Serine', 'Ser', 'S'),
    'TCG': ('Serine', 'Ser', 'S'),
    'AGT': ('Serine', 'Ser', 'S'),
    'AGC': ('Serine', 'Ser', 'S'),
    'TAT': ('Tyrosine', 'Tyr', 'Y'),
    'TAC': ('Tyrosine', 'Tyr', 'Y'),
    'TGG': ('Tryptophan', 'Trp', 'W'),
    'CAA': ('Glutamine', 'Gln', 'Q'),
    'CAG': ('Glutamine', 'Gln', 'Q'),
    'AAT': ('Asparagine', 'Asn', 'N'),
    'AAC': ('Asparagine', 'Asn', 'N'),
    'CAT': ('Histidine', 'His', 'H'),
    'CAC': ('Histidine', 'His', 'H'),
    'GAA': ('Glutamic acid', 'Glu', 'E'),
    'GAG': ('Glutamic acid', 'Glu', 'E'),
    'GAT': ('Aspartic acid', 'Asp', 'D'),
    'GAC': ('Aspartic acid', 'Asp', 'D'),
    'AAA': ('Lysine', 'Lys', 'K'),
    'AAG': ('Lysine', 'Lys', 'K'),
    'CGT': ('Arginine', 'Arg', 'R'),
    'CGC': ('Arginine', 'Arg', 'R'),
    'CGA': ('Arginine', 'Arg', 'R'),
    'CGG': ('Arginine', 'Arg', 'R'),
    'AGA': ('Arginine', 'Arg', 'R'),
    'AGG': ('Arginine', 'Arg', 'R'),
    'TAA': ('Stop', 'Stop', '___'),
    'TAG': ('Stop', 'Stop', '___'),
    'TGA': ('Stop', 'Stop', '___'),
}
構建翻譯函數:
def translate(dna):
    """Translate a DNA string codon by codon using the ``codon`` table.

    The input is upper-cased and read in consecutive, non-overlapping
    triplets starting at index 0. One or two leftover bases at the end are
    ignored. Translation does not stop at stop codons; they appear in the
    output as ``Stop`` / ``Stop`` / ``___``.

    Args:
        dna: Nucleotide sequence (any letter case).

    Returns:
        A 3-tuple of strings ``(aminoacid, amino, a)``:
        full names joined by ``-``, three-letter codes joined by ``-``,
        and one-letter codes concatenated without a separator.
        For example ``'ATGTAA'`` gives
        ``('Methionine-Stop', 'Met-Stop', 'M___')``.
        All three are empty when ``dna`` holds no complete codon.

    Raises:
        KeyError: If a triplet is not a key of ``codon``.
    """
    DNA = dna.upper()
    # Start offsets of every complete codon: i + 3 <= len(dna).
    entries = [codon[DNA[i:i + 3]] for i in range(0, len(dna) - 2, 3)]
    # Build each output with one join rather than repeated concatenation.
    aminoacid = '-'.join(entry[0] for entry in entries)
    amino = '-'.join(entry[1] for entry in entries)
    a = ''.join(entry[2] for entry in entries)
    return aminoacid, amino, a
說明定位是 21563 到 25384。注意,在 Python 中,由於 slice 是從 0 開始計數,所以引用 index 時要減 1,即 21562 到 25383:
# Cut the region named spike_dna out of the genome and translate it.
# NOTE(review): Python slice ends are exclusive, so [21562:25383] covers
# 1-based positions 21563..25383 (3821 bases when the genome is long enough)
# and leaves out position 25384. translate() ignores a trailing incomplete
# codon, so only whole codons are translated; presumably the omitted base
# belongs to the final (stop) codon -- confirm whether the end should be 25384.
spike_dna = coronavirusdna[21562:25383]
# translate() returns (full names, three-letter codes, one-letter codes).
spike_protein_tuple = translate(spike_dna)
# Keep only the one-letter sequence.
spike_protein = spike_protein_tuple[2]
應用BioPython解析和可視化蛋白質的結構
傳染病模型的Python實現
如何安裝和使用免費的蛋白質空間結構可視化軟體 PyMol
關注Chuan Workstation
獲得更多臨床醫學和
生物信息學新技術和新進展