def snpsift_annotation()

in src/main/python/vcf_snpeff_annotation.py [0:0]


def snpsift_annotation(merge,passfile,dbtype,SNPSIFT,anno_db,TRANSVAR_ANNOVERSION):
		
	if dbtype == 'dbnsfp':
		cmd = 'echo `date` begin annotate '+dbtype+' variants using SnpSift...\n'
		dbnsfp_annofile = passfile+'.dbnsfp.annotation'
		cmd += 'java -jar '+SNPSIFT+' dbnsfp -g '+TRANSVAR_ANNOVERSION+' -db '+anno_db+' -f '
		cmd += 'SIFT_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,MutationTaster_pred,'
		cmd += 'MutationAssessor_pred,FATHMM_pred,CADD_phred,ESP6500_AA_AF,1000Gp1_AF,ExAC_AF,'
		cmd += 'phastCons100way_vertebrate,GERP++_RS,COSMIC_ID,COSMIC_CNT,clinvar_clnsig,clinvar_trait '
		cmd += passfile+' > '+dbnsfp_annofile+'\n'
		cmd += 'echo `date` finish annotate '+dbtype+' variants using SnpSift.\n'
		run(cmd)
		
		annotated_variants=dict()
		anno=open(dbnsfp_annofile)
		## SIFT                D: Deleterious (sift<=0.05); T: tolerated (sift>0.05)
		## PolyPhen 2 HDIV     D: Probably damaging (>=0.957), P: possibly damaging (0.453<=pp2_hdiv<=0.956); B: benign (pp2_hdiv<=0.452)
		## PolyPhen 2 HVar     D: Probably damaging (>=0.909), P: possibly damaging (0.447<=pp2_hdiv<=0.909); B: benign (pp2_hdiv<=0.446)
		## MutationTaster      A ("disease_causing_automatic"); "D" ("disease_causing"); "N" ("polymorphism"); "P" ("polymorphism_automatic")
		## MutationAssessor    H: high; M: medium; L: low; N: neutral. H/M means functional and L/N means non-functional
		## FATHMM              D: Deleterious; T: Tolerated
		## CADD phred          20 means top 1%, 30 means top 0.1%. percentage = 10^(-CADD phred/10)
		## GERP++              higher scores are more deleterious(>2)
		## PhyloP              higher scores are more deleterious
		for value in anno:
			if value.startswith("#CHROM"):
				data = value.rstrip().split("\t")
				for i in range(len(data)):
					if data[i]=="#CHROM":
						chrom_index = i
					elif data[i]=="ID":
						dbsnpID_index = i
					elif data[i]=="POS":
						position_index = i
					elif data[i]=="REF":
						ref_index = i
					elif data[i]=="ALT":
						alt_index = i
					elif data[i]=="INFO":
						info_index = i
			if not value.startswith("#"):
				data = value.rstrip().split("\t")
				anno_variant=data[chrom_index]+"_"+data[position_index]+"_"+data[ref_index]+"_"+data[alt_index]
				dbsnp_ID=data[dbsnpID_index]
				infor = data[info_index].split(";")
				COSMIC_id,COSMIC_count,clinvar_sig,clinvar_trait,\
				genome_af,esp6500_af,exac_af,sift_pred,poly2HDIV_pred,poly2HVAR_pred,\
				MT_pred,MA_pred,FATHMM_pred,CADD_phred,gerpRS,phastCons100way_mammalian=\
					'-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-'
				for info in infor:
					if len(info.split("="))==1:
						pass
					elif len(info.split("="))==2:
						key,val=info.split("=")
						if key=="dbNSFP_1000Gp1_AF":
							genome_af=val
						elif key=="dbNSFP_ESP6500_AA_AF":
							esp6500_af=val
						elif key=="dbNSFP_ExAC_AF":
							exac_af=val
						elif key=="dbNSFP_SIFT_pred":
							sift_pred=val
						elif key=="dbNSFP_Polyphen2_HDIV_pred":
							poly2HDIV_pred=val
						elif key=="dbNSFP_Polyphen2_HVAR_pred":
							poly2HVAR_pred=val
						elif key=="dbNSFP_MutationTaster_pred":
							MT_pred=val
						elif key=="dbNSFP_MutationAssessor_pred":
							MA_pred=val
						elif key=="dbNSFP_FATHMM_pred":
							FATHMM_pred=val
						elif key=="dbNSFP_CADD_phred":
							CADD_phred=val
						elif key=="dbNSFP_GERP___RS":
							gerpRS=val
						elif key=="dbNSFP_phastCons100way_vertebrate":
							phastCons100way_mammalian=val
						elif key=="dbNSFP_COSMIC_ID":
							COSMIC_id=val
						elif key=="dbNSFP_COSMIC_CNT":
							COSMIC_count=val
						elif key=="dbNSFP_clinvar_clnsig":
							clinvar_sig=val
						elif key=="dbNSFP_clinvar_trait":
							clinvar_trait=val
				if anno_variant not in annotated_variants:
					annotated_variants[anno_variant]=[COSMIC_id,COSMIC_count,clinvar_sig,clinvar_trait,
										dbsnp_ID,genome_af,esp6500_af,exac_af,sift_pred,poly2HDIV_pred,poly2HVAR_pred,
										MT_pred,MA_pred,FATHMM_pred,CADD_phred,gerpRS,phastCons100way_mammalian]
		
		merge_dbtype = open(re.sub('\.annotation','.'+dbtype+'.annotation',merge),'w')
		input = open(merge)
		header = input.readline()
		data = header.rstrip().split("\t")
		for i in range(len(data)):
			if data[i]=="#CHROM":
				chrom_index = i
			elif data[i]=="POS":
				position_index = i
			elif data[i]=="REF":
				ref_index = i
			elif data[i]=="ALT":
				alt_index = i
		merge_dbtype.write("%s\tCOSMIC_ID\tCOSMIC_COUNT\tClinvar_sig\tClinvar_Trait\tDBSNP_ID\t1000GP_AF\tESP6500_AF\tEXAC_AF\tSIFT\tPolyPhen2_HDIV\tPolyPhen2_HDAR\tMutationTaster\tMutationAssessor\tFATHMM\tCADD_phred\tGERP++RS\tphastCons100way\n" % header.rstrip())
		for data in input:
			annotates = data.rstrip().split("\t")
			variant = annotates[chrom_index]+"_"+annotates[position_index]+"_"+annotates[ref_index]+"_"+annotates[alt_index]
			if variant in annotated_variants:
				merge_dbtype.write("%s\t%s\n" %
									(data.rstrip(),'\t'.join(annotated_variants[variant])))
			else:
				merge_dbtype.write("%s\t-\t-\t-\t-\t.\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\n" % data.rstrip())
		merge_dbtype.close()
		
		cmd = 'echo `date` begin remove '+dbtype+' temporary files...\n'
		cmd += 'rm -rf '+dbnsfp_annofile+'\n'
		cmd += 'rm -rf '+merge+'\n'
		cmd += 'echo `date` finish remove '+dbtype+' temporary files...\n'
		run(cmd)
		merge = re.sub('\.annotation','.'+dbtype+'.annotation',merge)
		
	return (merge)