in src/main/python/vcf_annovar_annotation.py [0:0]
def annovar_annotation(merge,annovar_passfile,vcf2annovar_var_match,dbtype,refversion):
if dbtype == 'multiscore':
cmd = 'echo `date` begin annotate '+dbtype+' variants using ANNOVAR...\n'
cmd += 'perl '+ANNOVAR_TABLE+' -buildver '+refversion+' -protocol '+db[dbtype]+' '
cmd += annovar_passfile+' '+ANNOVAR_DB+' -operation f -nastring - \n'
dropped = annovar_passfile+'.'+refversion+'_'+outname[dbtype]+'_dropped'
filtered = annovar_passfile+'.'+refversion+'_'+outname[dbtype]+'_filtered'
log = annovar_passfile+'.log'
invalid = annovar_passfile+'.invalid_input'
all = annovar_passfile+'.'+refversion+'_multianno.txt'
cmd += 'echo `date` finish annotate '+dbtype+' variants using ANNOVAR...\n'
run(cmd)
annotated_variants=dict()
anno=open(all)
header = anno.readline()
## SIFT D: Deleterious (sift<=0.05); T: tolerated (sift>0.05)
## PolyPhen 2 HDIV D: Probably damaging (>=0.957), P: possibly damaging (0.453<=pp2_hdiv<=0.956); B: benign (pp2_hdiv<=0.452)
## PolyPhen 2 HVar D: Probably damaging (>=0.909), P: possibly damaging (0.447<=pp2_hdiv<=0.909); B: benign (pp2_hdiv<=0.446)
## MutationTaster A ("disease_causing_automatic"); "D" ("disease_causing"); "N" ("polymorphism"); "P" ("polymorphism_automatic")
## MutationAssessor H: high; M: medium; L: low; N: neutral. H/M means functional and L/N means non-functional
## FATHMM D: Deleterious; T: Tolerated
## VEST3 higher scores are more deleterious
## CADD raw higher scores are more significant(>4)
## CADD phred 20 means top 1%, 30 means top 0.1%. percentage = 10^(-CADD phred/10)
## GERP++ higher scores are more deleterious(>2)
## PhyloP higher scores are more deleterious
for data in anno:
annotates = data.rstrip().split("\t")
chr,start,end,ref,alt=annotates[:5]
sift_score,sift_pred=annotates[5:7]
poly2HDIV_score,poly2HDIV_pred=annotates[7:9]
poly2HVAR_score,poly2HVAR_pred=annotates[9:11]
MT_score,MT_pred=annotates[13:15]
MA_score,MA_pred=annotates[15:17]
FATHMM_score,FATHMM_pred=annotates[17:19]
VEST3score=annotates[21]
CADD_score,CADD_phred=annotates[22:24]
gerpRS=annotates[33]
phyloP20way_mammalian=annotates[35]
phastCons20way_mammalian=annotates[37]
anno_variant=chr+':'+start+'_'+end+ref+'/'+alt
if anno_variant not in annotated_variants:
annotated_variants[anno_variant]=[sift_score,sift_pred,poly2HDIV_score,poly2HDIV_pred,poly2HVAR_score,poly2HVAR_pred,
MT_score,MT_pred,MA_score,MA_pred,
FATHMM_score,FATHMM_pred,VEST3score,CADD_score,CADD_phred,gerpRS,
phyloP20way_mammalian,phastCons20way_mammalian]
merge_dbtype = open(re.sub('\.annotation','.'+dbtype+'.annotation',merge),'w')
input = open(merge)
header = input.readline()
data = header.rstrip().split("\t")
for i in range(len(data)):
if data[i]=="#CHROM":
chrom_index = i
elif data[i]=="POS":
position_index = i
elif data[i]=="REF":
ref_index = i
elif data[i]=="ALT":
alt_index = i
merge_dbtype.write("%s\tSIFT\tPolyPhen2_HDIV\tPolyPhen2_HDAR\tMutationTaster\tMutationAssessor\tFATHMM\tVEST3\tCADD_raw\tCADD_phred\tGERP++\tphyloP20way\tphastCons20way\n" % header.rstrip())
for data in input:
annotates = data.rstrip().split("\t")
variant = annotates[chrom_index]+"_"+annotates[position_index]+"_"+annotates[ref_index]+"_"+annotates[alt_index]
if variant in vcf2annovar_var_match:
anno_variant = vcf2annovar_var_match[variant]
if anno_variant in annotated_variants:
sift_score,sift_pred,poly2HDIV_score,poly2HDIV_pred,poly2HVAR_score,poly2HVAR_pred,MT_score,MT_pred,MA_score,MA_pred,FATHMM_score,FATHMM_pred,VEST3score,CADD_score,CADD_phred,gerpRS,phyloP20way_mammalian,phastCons20way_mammalian = annotated_variants[anno_variant]
merge_dbtype.write("%s\t%s(%s)\t%s(%s)\t%s(%s)\t%s(%s)\t%s(%s)\t%s(%s)\t%s\t%s\t%s\t%s\t%s\t%s\n" %
(data.rstrip(),sift_score,sift_pred,poly2HDIV_score,poly2HDIV_pred,poly2HVAR_score,poly2HVAR_pred,
MT_score,MT_pred,MA_score,MA_pred,FATHMM_score,FATHMM_pred,VEST3score,CADD_score,CADD_phred,gerpRS,
phyloP20way_mammalian,phastCons20way_mammalian))
else:
merge_dbtype.write("%s\t-(-)\t-(-)\t-(-)\t-(-)\t-(-)\t-(-)\t-\t-\t-\t-\t-\t-\n" % data.rstrip())
else:
merge_dbtype.write("%s\t-(-)\t-(-)\t-(-)\t-(-)\t-(-)\t-(-)\t-\t-\t-\t-\t-\t-\n" % data.rstrip())
merge_dbtype.close()
cmd = 'echo `date` begin remove '+dbtype+' temporary files...\n'
cmd += 'rm -rf '+dropped+'\n'
cmd += 'rm -rf '+filtered+'\n'
cmd += 'rm -rf '+log+'\n'
cmd += 'rm -rf '+invalid+'\n'
cmd += 'rm -rf '+all+'\n'
cmd += 'rm -rf '+merge+'\n'
cmd += 'echo `date` finish remove '+dbtype+' temporary files...\n'
run(cmd)
merge = re.sub('\.annotation','.'+dbtype+'.annotation',merge)
else:
cmd = 'echo `date` begin annotate '+dbtype+' variants using ANNOVAR...\n'
cmd += 'perl '+ANNOVAR_ANNO+' -buildver '+refversion+' -filter -dbtype '+db[dbtype]+' '
cmd += annovar_passfile+' '+ANNOVAR_DB+'\n'
dropped = annovar_passfile+'.'+refversion+'_'+outname[dbtype]+'_dropped'
filtered = annovar_passfile+'.'+refversion+'_'+outname[dbtype]+'_filtered'
log = annovar_passfile+'.log'
invalid = annovar_passfile+'.invalid_input'
cmd += 'echo `date` finish annotate '+dbtype+' variants using ANNOVAR...\n'
run(cmd)
annotated_variants=dict()
for data in open(dropped):
annotates = data.rstrip().split("\t")
score,chr,start,end,ref,alt=annotates[1:7]
anno_variant=chr+':'+start+'_'+end+ref+'/'+alt
if anno_variant not in annotated_variants:
annotated_variants[anno_variant]=score
merge_dbtype = open(re.sub('\.annotation','.'+dbtype+'.annotation',merge),'w')
input = open(merge)
header = input.readline()
data = header.rstrip().split("\t")
for i in range(len(data)):
if data[i]=="#CHROM":
chrom_index = i
elif data[i]=="POS":
position_index = i
elif data[i]=="REF":
ref_index = i
elif data[i]=="ALT":
alt_index = i
merge_dbtype.write("%s\t%s\n" % (header.rstrip(),alias[dbtype]))
for data in input:
annotates = data.rstrip().split("\t")
variant = annotates[chrom_index]+"_"+annotates[position_index]+"_"+annotates[ref_index]+"_"+annotates[alt_index]
if variant in vcf2annovar_var_match:
anno_variant = vcf2annovar_var_match[variant]
if anno_variant in annotated_variants:
merge_dbtype.write("%s\t%s\n" % (data.rstrip(),annotated_variants[anno_variant]))
else:
merge_dbtype.write("%s\t-\n" % data.rstrip())
else:
merge_dbtype.write("%s\t-\n" % data.rstrip())
merge_dbtype.close()
cmd = 'echo `date` begin remove '+dbtype+' temporary files...\n'
cmd += 'rm -rf '+dropped+'\n'
cmd += 'rm -rf '+filtered+'\n'
cmd += 'rm -rf '+log+'\n'
cmd += 'rm -rf '+invalid+'\n'
cmd += 'rm -rf '+merge+'\n'
cmd += 'echo `date` finish remove '+dbtype+' temporary files...\n'
run(cmd)
merge = re.sub('\.annotation','.'+dbtype+'.annotation',merge)
return (merge)