def annovar_annotation()

in src/main/python/vcf_annovar_annotation.py [0:0]


def annovar_annotation(merge,annovar_passfile,vcf2annovar_var_match,dbtype,refversion):
	if dbtype == 'multiscore':
		cmd = 'echo `date` begin annotate '+dbtype+' variants using ANNOVAR...\n'
		cmd += 'perl '+ANNOVAR_TABLE+' -buildver '+refversion+' -protocol '+db[dbtype]+' '
		cmd += annovar_passfile+' '+ANNOVAR_DB+' -operation f -nastring - \n'
		dropped = annovar_passfile+'.'+refversion+'_'+outname[dbtype]+'_dropped'
		filtered = annovar_passfile+'.'+refversion+'_'+outname[dbtype]+'_filtered'
		log = annovar_passfile+'.log'
		invalid = annovar_passfile+'.invalid_input'
		all = annovar_passfile+'.'+refversion+'_multianno.txt'
		cmd += 'echo `date` finish annotate '+dbtype+' variants using ANNOVAR...\n'
		run(cmd)
		
		annotated_variants=dict()
		anno=open(all)
		header = anno.readline()
		## SIFT                D: Deleterious (sift<=0.05); T: tolerated (sift>0.05)
		## PolyPhen 2 HDIV     D: Probably damaging (>=0.957), P: possibly damaging (0.453<=pp2_hdiv<=0.956); B: benign (pp2_hdiv<=0.452)
		## PolyPhen 2 HVar     D: Probably damaging (>=0.909), P: possibly damaging (0.447<=pp2_hdiv<=0.909); B: benign (pp2_hdiv<=0.446)
		## MutationTaster      A ("disease_causing_automatic"); "D" ("disease_causing"); "N" ("polymorphism"); "P" ("polymorphism_automatic")
		## MutationAssessor    H: high; M: medium; L: low; N: neutral. H/M means functional and L/N means non-functional
		## FATHMM              D: Deleterious; T: Tolerated
		## VEST3               higher scores are more deleterious
		## CADD raw            higher scores are more significant(>4)
		## CADD phred          20 means top 1%, 30 means top 0.1%. percentage = 10^(-CADD phred/10)
		## GERP++              higher scores are more deleterious(>2)
		## PhyloP              higher scores are more deleterious
		for data in anno:
			annotates = data.rstrip().split("\t")
			chr,start,end,ref,alt=annotates[:5]
			sift_score,sift_pred=annotates[5:7]
			poly2HDIV_score,poly2HDIV_pred=annotates[7:9]
			poly2HVAR_score,poly2HVAR_pred=annotates[9:11]
			MT_score,MT_pred=annotates[13:15]
			MA_score,MA_pred=annotates[15:17]
			FATHMM_score,FATHMM_pred=annotates[17:19]
			VEST3score=annotates[21]
			CADD_score,CADD_phred=annotates[22:24]
			gerpRS=annotates[33]
			phyloP20way_mammalian=annotates[35]
			phastCons20way_mammalian=annotates[37]
			anno_variant=chr+':'+start+'_'+end+ref+'/'+alt
			if anno_variant not in annotated_variants:
				annotated_variants[anno_variant]=[sift_score,sift_pred,poly2HDIV_score,poly2HDIV_pred,poly2HVAR_score,poly2HVAR_pred,
											 MT_score,MT_pred,MA_score,MA_pred,
											 FATHMM_score,FATHMM_pred,VEST3score,CADD_score,CADD_phred,gerpRS,
											 phyloP20way_mammalian,phastCons20way_mammalian]
		
		merge_dbtype = open(re.sub('\.annotation','.'+dbtype+'.annotation',merge),'w')
		input = open(merge)
		header = input.readline()
		data = header.rstrip().split("\t")
		for i in range(len(data)):
			if data[i]=="#CHROM":
				chrom_index = i
			elif data[i]=="POS":
				position_index = i
			elif data[i]=="REF":
				ref_index = i
			elif data[i]=="ALT":
				alt_index = i
		merge_dbtype.write("%s\tSIFT\tPolyPhen2_HDIV\tPolyPhen2_HDAR\tMutationTaster\tMutationAssessor\tFATHMM\tVEST3\tCADD_raw\tCADD_phred\tGERP++\tphyloP20way\tphastCons20way\n" % header.rstrip())
		for data in input:
			annotates = data.rstrip().split("\t")
			variant = annotates[chrom_index]+"_"+annotates[position_index]+"_"+annotates[ref_index]+"_"+annotates[alt_index]
			if variant in vcf2annovar_var_match:
				anno_variant = vcf2annovar_var_match[variant]
				if anno_variant in annotated_variants:
					sift_score,sift_pred,poly2HDIV_score,poly2HDIV_pred,poly2HVAR_score,poly2HVAR_pred,MT_score,MT_pred,MA_score,MA_pred,FATHMM_score,FATHMM_pred,VEST3score,CADD_score,CADD_phred,gerpRS,phyloP20way_mammalian,phastCons20way_mammalian = annotated_variants[anno_variant]
					merge_dbtype.write("%s\t%s(%s)\t%s(%s)\t%s(%s)\t%s(%s)\t%s(%s)\t%s(%s)\t%s\t%s\t%s\t%s\t%s\t%s\n" %
									   (data.rstrip(),sift_score,sift_pred,poly2HDIV_score,poly2HDIV_pred,poly2HVAR_score,poly2HVAR_pred,
										MT_score,MT_pred,MA_score,MA_pred,FATHMM_score,FATHMM_pred,VEST3score,CADD_score,CADD_phred,gerpRS,
										phyloP20way_mammalian,phastCons20way_mammalian))
				else:
					merge_dbtype.write("%s\t-(-)\t-(-)\t-(-)\t-(-)\t-(-)\t-(-)\t-\t-\t-\t-\t-\t-\n" % data.rstrip())
			else:
				merge_dbtype.write("%s\t-(-)\t-(-)\t-(-)\t-(-)\t-(-)\t-(-)\t-\t-\t-\t-\t-\t-\n" % data.rstrip())
		merge_dbtype.close()
		
		cmd = 'echo `date` begin remove '+dbtype+' temporary files...\n'
		cmd += 'rm -rf '+dropped+'\n'
		cmd += 'rm -rf '+filtered+'\n'
		cmd += 'rm -rf '+log+'\n'
		cmd += 'rm -rf '+invalid+'\n'
		cmd += 'rm -rf '+all+'\n'
		cmd += 'rm -rf '+merge+'\n'
		cmd += 'echo `date` finish remove '+dbtype+' temporary files...\n'
		run(cmd)
		merge = re.sub('\.annotation','.'+dbtype+'.annotation',merge)
		
	else:
		cmd = 'echo `date` begin annotate '+dbtype+' variants using ANNOVAR...\n'
		cmd += 'perl '+ANNOVAR_ANNO+' -buildver '+refversion+' -filter -dbtype '+db[dbtype]+' '
		cmd += annovar_passfile+' '+ANNOVAR_DB+'\n'
		dropped = annovar_passfile+'.'+refversion+'_'+outname[dbtype]+'_dropped'
		filtered = annovar_passfile+'.'+refversion+'_'+outname[dbtype]+'_filtered'
		log = annovar_passfile+'.log'
		invalid = annovar_passfile+'.invalid_input'
		cmd += 'echo `date` finish annotate '+dbtype+' variants using ANNOVAR...\n'
		run(cmd)
	
		annotated_variants=dict()
		for data in open(dropped):
			annotates = data.rstrip().split("\t")
			score,chr,start,end,ref,alt=annotates[1:7]
			anno_variant=chr+':'+start+'_'+end+ref+'/'+alt
			if anno_variant not in annotated_variants:
				annotated_variants[anno_variant]=score
		
		merge_dbtype = open(re.sub('\.annotation','.'+dbtype+'.annotation',merge),'w')
		input = open(merge)
		header = input.readline()
		data = header.rstrip().split("\t")
		for i in range(len(data)):
			if data[i]=="#CHROM":
				chrom_index = i
			elif data[i]=="POS":
				position_index = i
			elif data[i]=="REF":
				ref_index = i
			elif data[i]=="ALT":
				alt_index = i
		merge_dbtype.write("%s\t%s\n" % (header.rstrip(),alias[dbtype]))
		for data in input:
			annotates = data.rstrip().split("\t")
			variant = annotates[chrom_index]+"_"+annotates[position_index]+"_"+annotates[ref_index]+"_"+annotates[alt_index]
			if variant in vcf2annovar_var_match:
				anno_variant = vcf2annovar_var_match[variant]
				if anno_variant in annotated_variants:
					merge_dbtype.write("%s\t%s\n" % (data.rstrip(),annotated_variants[anno_variant]))
				else:
					merge_dbtype.write("%s\t-\n" % data.rstrip())
			else:
				merge_dbtype.write("%s\t-\n" % data.rstrip())
		merge_dbtype.close()
	
		cmd = 'echo `date` begin remove '+dbtype+' temporary files...\n'
		cmd += 'rm -rf '+dropped+'\n'
		cmd += 'rm -rf '+filtered+'\n'
		cmd += 'rm -rf '+log+'\n'
		cmd += 'rm -rf '+invalid+'\n'
		cmd += 'rm -rf '+merge+'\n'
		cmd += 'echo `date` finish remove '+dbtype+' temporary files...\n'
		run(cmd)
		merge = re.sub('\.annotation','.'+dbtype+'.annotation',merge)
		
	return (merge)