in src/main/python/vcf_snpeff_annotation.py [0:0]
def snpsift_annotation(merge,passfile,dbtype,SNPSIFT,anno_db,TRANSVAR_ANNOVERSION):
if dbtype == 'dbnsfp':
cmd = 'echo `date` begin annotate '+dbtype+' variants using SnpSift...\n'
dbnsfp_annofile = passfile+'.dbnsfp.annotation'
cmd += 'java -jar '+SNPSIFT+' dbnsfp -g '+TRANSVAR_ANNOVERSION+' -db '+anno_db+' -f '
cmd += 'SIFT_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,MutationTaster_pred,'
cmd += 'MutationAssessor_pred,FATHMM_pred,CADD_phred,ESP6500_AA_AF,1000Gp1_AF,ExAC_AF,'
cmd += 'phastCons100way_vertebrate,GERP++_RS,COSMIC_ID,COSMIC_CNT,clinvar_clnsig,clinvar_trait '
cmd += passfile+' > '+dbnsfp_annofile+'\n'
cmd += 'echo `date` finish annotate '+dbtype+' variants using SnpSift.\n'
run(cmd)
annotated_variants=dict()
anno=open(dbnsfp_annofile)
## SIFT D: Deleterious (sift<=0.05); T: tolerated (sift>0.05)
## PolyPhen 2 HDIV D: Probably damaging (>=0.957), P: possibly damaging (0.453<=pp2_hdiv<=0.956); B: benign (pp2_hdiv<=0.452)
## PolyPhen 2 HVar D: Probably damaging (>=0.909), P: possibly damaging (0.447<=pp2_hdiv<=0.909); B: benign (pp2_hdiv<=0.446)
## MutationTaster A ("disease_causing_automatic"); "D" ("disease_causing"); "N" ("polymorphism"); "P" ("polymorphism_automatic")
## MutationAssessor H: high; M: medium; L: low; N: neutral. H/M means functional and L/N means non-functional
## FATHMM D: Deleterious; T: Tolerated
## CADD phred 20 means top 1%, 30 means top 0.1%. percentage = 10^(-CADD phred/10)
## GERP++ higher scores are more deleterious(>2)
## PhyloP higher scores are more deleterious
for value in anno:
if value.startswith("#CHROM"):
data = value.rstrip().split("\t")
for i in range(len(data)):
if data[i]=="#CHROM":
chrom_index = i
elif data[i]=="ID":
dbsnpID_index = i
elif data[i]=="POS":
position_index = i
elif data[i]=="REF":
ref_index = i
elif data[i]=="ALT":
alt_index = i
elif data[i]=="INFO":
info_index = i
if not value.startswith("#"):
data = value.rstrip().split("\t")
anno_variant=data[chrom_index]+"_"+data[position_index]+"_"+data[ref_index]+"_"+data[alt_index]
dbsnp_ID=data[dbsnpID_index]
infor = data[info_index].split(";")
COSMIC_id,COSMIC_count,clinvar_sig,clinvar_trait,\
genome_af,esp6500_af,exac_af,sift_pred,poly2HDIV_pred,poly2HVAR_pred,\
MT_pred,MA_pred,FATHMM_pred,CADD_phred,gerpRS,phastCons100way_mammalian=\
'-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-'
for info in infor:
if len(info.split("="))==1:
pass
elif len(info.split("="))==2:
key,val=info.split("=")
if key=="dbNSFP_1000Gp1_AF":
genome_af=val
elif key=="dbNSFP_ESP6500_AA_AF":
esp6500_af=val
elif key=="dbNSFP_ExAC_AF":
exac_af=val
elif key=="dbNSFP_SIFT_pred":
sift_pred=val
elif key=="dbNSFP_Polyphen2_HDIV_pred":
poly2HDIV_pred=val
elif key=="dbNSFP_Polyphen2_HVAR_pred":
poly2HVAR_pred=val
elif key=="dbNSFP_MutationTaster_pred":
MT_pred=val
elif key=="dbNSFP_MutationAssessor_pred":
MA_pred=val
elif key=="dbNSFP_FATHMM_pred":
FATHMM_pred=val
elif key=="dbNSFP_CADD_phred":
CADD_phred=val
elif key=="dbNSFP_GERP___RS":
gerpRS=val
elif key=="dbNSFP_phastCons100way_vertebrate":
phastCons100way_mammalian=val
elif key=="dbNSFP_COSMIC_ID":
COSMIC_id=val
elif key=="dbNSFP_COSMIC_CNT":
COSMIC_count=val
elif key=="dbNSFP_clinvar_clnsig":
clinvar_sig=val
elif key=="dbNSFP_clinvar_trait":
clinvar_trait=val
if anno_variant not in annotated_variants:
annotated_variants[anno_variant]=[COSMIC_id,COSMIC_count,clinvar_sig,clinvar_trait,
dbsnp_ID,genome_af,esp6500_af,exac_af,sift_pred,poly2HDIV_pred,poly2HVAR_pred,
MT_pred,MA_pred,FATHMM_pred,CADD_phred,gerpRS,phastCons100way_mammalian]
merge_dbtype = open(re.sub('\.annotation','.'+dbtype+'.annotation',merge),'w')
input = open(merge)
header = input.readline()
data = header.rstrip().split("\t")
for i in range(len(data)):
if data[i]=="#CHROM":
chrom_index = i
elif data[i]=="POS":
position_index = i
elif data[i]=="REF":
ref_index = i
elif data[i]=="ALT":
alt_index = i
merge_dbtype.write("%s\tCOSMIC_ID\tCOSMIC_COUNT\tClinvar_sig\tClinvar_Trait\tDBSNP_ID\t1000GP_AF\tESP6500_AF\tEXAC_AF\tSIFT\tPolyPhen2_HDIV\tPolyPhen2_HDAR\tMutationTaster\tMutationAssessor\tFATHMM\tCADD_phred\tGERP++RS\tphastCons100way\n" % header.rstrip())
for data in input:
annotates = data.rstrip().split("\t")
variant = annotates[chrom_index]+"_"+annotates[position_index]+"_"+annotates[ref_index]+"_"+annotates[alt_index]
if variant in annotated_variants:
merge_dbtype.write("%s\t%s\n" %
(data.rstrip(),'\t'.join(annotated_variants[variant])))
else:
merge_dbtype.write("%s\t-\t-\t-\t-\t.\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\n" % data.rstrip())
merge_dbtype.close()
cmd = 'echo `date` begin remove '+dbtype+' temporary files...\n'
cmd += 'rm -rf '+dbnsfp_annofile+'\n'
cmd += 'rm -rf '+merge+'\n'
cmd += 'echo `date` finish remove '+dbtype+' temporary files...\n'
run(cmd)
merge = re.sub('\.annotation','.'+dbtype+'.annotation',merge)
return (merge)