clns-eTarget_ingest/genomicdata_json.py (689 lines of code) (raw):

#
# This file is part of the eTarget ingest distribution (https://github.com/digital-ECMT/eTarget_ingest).
# Copyright (C) 2017 - 2021 digital ECMT
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

import datetime
import json
import re
import traceback

import pymssql
from azure.storage.file import FileService
from jsonschema import Draft7Validator
from jsonschema import ValidationError

import utilities

# Specimen-id naming convention: 3 letters + 7 digits (the patient/target id),
# then a time point (T<n> for blood, Bx<n> for tissue), then a 2-char source code.
_PATIENT_PATTERN = re.compile(r"^[A-Z]{3}\d{7}\D")
_TIMEPOINT_PATTERNS = (re.compile(r"^T\d{1,2}(\D|$)"), re.compile(r"^Bx\d{1,2}(\D|$)"))
_TIMEPOINT_SOURCE_PATTERNS = (re.compile(r"^T\d{1,2}\D{2}"), re.compile(r"^Bx\d{1,2}\D{2}"))
_SOURCE_CODE_PATTERN = re.compile(r"\D{2}")
_BLOOD_TIMEPOINT_PATTERN = re.compile(r"T\d+")
_TISSUE_TIMEPOINT_PATTERN = re.compile(r"Bx\d+")

# Shared head of the "was this variant ticked in the previous panel?" lookup.
# Bound parameters, in order: target id, previous gene panel id, gene name.
_PREVIOUS_SELECTION_SQL = (
    "select type from SELECTED_GENE_VARIANT FULL OUTER JOIN MEASUREMENT_GENE_VARIANT on "
    "MEASUREMENT_GENE_VARIANT.measurement_gene_variant_id=SELECTED_GENE_VARIANT.measurement_gene_variant_id "
    "FULL OUTER JOIN MEASUREMENT_GENE_PANEL on "
    "MEASUREMENT_GENE_PANEL.measurement_gene_panel_id=MEASUREMENT_GENE_VARIANT.measurement_gene_panel_id "
    "where person_id=(SELECT person_id FROM PERSON WHERE person.target_id = %s) "
    "and MEASUREMENT_GENE_PANEL.measurement_gene_panel_id=%s "
    "and gene_concept_id=(SELECT TOP 1 gene_concept_id FROM CONCEPT_GENE WHERE gene_name = %s)"
)


class GenomicDataException(Exception):
    """Raised internally and is already sent to the log file/db"""
    pass


class GenomicDataJson:
    """Ingest one genomic-report JSON file into the eTARGET database.

    The file is read from the Azure file share in ``__init__``; ``ingest``
    validates it and writes the gene panel and its alterations inside a single
    database transaction; ``deleteFile`` removes the source file afterwards.
    """

    def __init__(self, filename, remotehostname, remoteusername, remotepassword, remotedbname, fileuser, filekey, datadir='data', logblob='log'):
        """Download the JSON text, open the DB connection and load the JSON schema.

        Args:
            filename: name of the JSON file inside ``datadir`` on the file share.
            remotehostname, remoteusername, remotepassword, remotedbname:
                SQL Server connection settings.
            fileuser, filekey: Azure storage account name and key.
            datadir: file share holding the incoming data files.
            logblob: passed through to ``utilities.Util`` for logging.
        """
        self.filename = filename
        self.datadir = datadir
        self.file_service = FileService(account_name=fileuser, account_key=filekey)
        self.genomicDataString = self.file_service.get_file_to_text(self.datadir, None, filename).content
        self.remotehostname = remotehostname
        self.remoteusername = remoteusername
        self.remotepassword = remotepassword
        self.remotedbname = remotedbname
        # autocommit is off: ingest() commits only when the whole file was accepted.
        self.conn = pymssql.connect(self.remotehostname, self.remoteusername, self.remotepassword, self.remotedbname, autocommit=False)
        self.log = utilities.Util(remotehostname, remoteusername, remotepassword, remotedbname, fileuser, filekey, logblob)
        self.errorflag = False
        self.config = self.log.getConfig()
        with open('genomicdata_schema.json', 'r') as f:
            schema_data = f.read()
        self.schema = Draft7Validator(json.loads(schema_data))

    def __del__(self):
        """Close the DB connection, if __init__ got far enough to open one."""
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _fetch_one(self, sql, params):
        """Run a parameterised query and return its first row (or None)."""
        cursor = self.conn.cursor()
        try:
            cursor.execute(sql, params)
            return cursor.fetchone()
        finally:
            cursor.close()

    def _execute(self, sql, params):
        """Run a parameterised statement whose result is not needed."""
        cursor = self.conn.cursor()
        try:
            cursor.execute(sql, params)
        finally:
            cursor.close()

    def _insert(self, sql, params):
        """Run a parameterised INSERT and return the cursor's ``lastrowid``."""
        cursor = self.conn.cursor()
        try:
            cursor.execute(sql, params)
            return cursor.lastrowid
        finally:
            cursor.close()

    def _failure(self, status, log_message, reason):
        """Record a failure (status table, log, errorflag) and return the exception to raise."""
        self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), status)
        self.log.logMessage(log_message)
        self.errorflag = True
        return GenomicDataException(reason)

    def _rejection(self, message):
        """Like _failure, for the common case where all three texts share one message."""
        return self._failure('Error: ' + message, self.filename + ' ' + message, message)

    def _required_value(self, alteration, key, context):
        """Return alteration[key]; reject the file when the key is absent."""
        if key not in alteration:
            raise self._rejection(key + ' field is required in ' + context)
        return alteration[key]

    def _required_text(self, alteration, key, context):
        """Return alteration[key]; reject the file when it is absent or blank."""
        value = self._required_value(alteration, key, context)
        if len(value.strip()) == 0:
            raise self._rejection(key + ' field needs a value in ' + context)
        return value

    def _optional_flag(self, alteration, key, context):
        """Return str(alteration[key]) for an optional boolean field, or None when absent."""
        if key not in alteration:
            return None
        value = alteration[key]
        if not isinstance(value, bool):
            raise self._rejection(key + ' field needs to be boolean in ' + context)
        return str(value)

    def _carry_over_selection(self, tar_id, prev_gene_panel_id, gene, variant_id, criteria):
        """Re-select a newly inserted variant if the same variant was selected in the previous panel.

        ``criteria`` is a sequence of ``(column, value)`` pairs identifying the
        variant. A value of None is matched with ``IS NULL`` because absent
        optional fields are stored as NULL by the INSERT statements.
        """
        clauses = []
        params = [tar_id, prev_gene_panel_id, gene]
        for column, value in criteria:
            if value is None:
                clauses.append(" and " + column + " IS NULL")
            else:
                clauses.append(" and " + column + " = %s")
                params.append(value)
        previous = self._fetch_one(_PREVIOUS_SELECTION_SQL + "".join(clauses), tuple(params))
        if previous is None:
            return
        # The variant was ticked before: carry the same selection type over.
        self._execute(
            "insert into SELECTED_GENE_VARIANT (person_id, measurement_gene_variant_id, type) "
            "values((SELECT person_id FROM PERSON WHERE person.target_id = %s), %s, %s)",
            (tar_id, variant_id, previous[0]))

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------

    def deleteFile(self):
        """Delete the source file from the file share; report 'Success' if no error was flagged."""
        try:
            if self.file_service.exists(self.datadir, None, self.filename):
                self.file_service.delete_file(self.datadir, None, self.filename)
            if not self.errorflag:
                self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Success')
        except Exception:
            self.log.logMessage('There was a problem deleting '+self.filename)
            self.log.logMessage(traceback.format_exc())

    def rollback(self):
        """Roll back the open transaction and close the connection."""
        self.conn.rollback()
        self.conn.close()

    def ingest(self):
        """Validate the JSON document and store it in the database.

        Returns:
            0 when the file was ingested and committed, -1 otherwise. Every
            failure path sets ``self.errorflag`` so that ``deleteFile`` does
            not report success afterwards.
        """
        try:
            self.genomicalDataAll = json.loads(self.genomicDataString)
            self.schema.validate(self.genomicalDataAll)
            if 'type' not in self.genomicalDataAll:
                self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: JSON file type not recognised')
                self.log.logMessage('Error: JSON file type not recognised')
                self.errorflag = True
                return -1
            if self.genomicalDataAll['type'].lower().strip() != 'genomic':
                self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: JSON file type must be genomic')
                self.log.logMessage('Error: JSON file type must be genomic')
                self.errorflag = True
                return -1

            specimen_id, baseline, tar_id, source_txt = self.getSpecimen()

            # NOTE(review): the next two status updates use component 'FM' while
            # everything else in this class uses 'Genomic'; kept as found - confirm intent.
            if not self.checkCompany():
                self.log.logMessage(self.filename + ' could not find source in DB')
                self.log.systemStatusUpdate(self.filename, 'FM', self.log.timestamp(), 'Error: the source is not present in eTARGET')
                raise Exception('Ingest of '+self.filename+' failed -- source not found')
            if tar_id is None:
                # Patient unknown: report an unknown site first, as the more specific cause.
                if not self.checkSite():
                    self.log.logMessage(self.filename + ' could not find site in DB')
                    self.log.systemStatusUpdate(self.filename, 'FM', self.log.timestamp(), 'Error: the site is not present in eTARGET')
                    raise Exception('Ingest of '+self.filename+' failed -- site not found')
                self.log.logMessage(self.filename + ' could not find patient')
                self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: Patient not found')
                raise Exception('Ingest of '+self.filename+' failed -- patient not found')
            if specimen_id is None:
                self.log.logMessage(self.filename + ' could not find specimen_id')
                self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: Specimen not found')
                raise Exception('Ingest of '+self.filename+' failed -- specimen not found')
            if source_txt is None:
                self.log.logMessage(self.filename + ' could not find source chars')
                self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: Source chars not found')
                raise Exception('Ingest of '+self.filename+' failed -- source not found')

            resubmission, measurement_gene_panel_id = self.checkResubmission(specimen_id, baseline, tar_id, source_txt)
            # resubmission=0: no previous data -- new ingest
            # resubmission=1: it is resubmission and old data can be deleted
            # resubmission=2: it is resubmission and data needs to be new version
            updateSelection = resubmission == 2
            if resubmission == 1:
                self.deleteGenomicData(measurement_gene_panel_id)

            new_panel_id = self.insertMeasurementGenePanel(specimen_id, baseline, source_txt)
            self.parseAlterations(new_panel_id, tar_id, updateSelection, measurement_gene_panel_id)

            if not self.errorflag:
                # no problems found
                self.conn.commit()
                self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Success')
                self.log.logMessage(self.filename + ' successfully ingested')
                return 0
            self.rollback()
            self.log.logMessage(self.filename + ' problems with ingestion')
            return -1
        except ValidationError as e:
            self.errorflag = True
            self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: JSON format invalid; check: ' + ' -> '.join(str(x) for x in e.path).replace("'", "''") +"; "+ e.message.replace("'", "''"))
            self.log.logMessage(self.filename + ' json format wrong' + str(e) )
            self.log.logMessage(traceback.format_exc())
            return -1
        except GenomicDataException as e:
            # Status and log entries were already written where the error was detected.
            self.errorflag = True
            self.log.logMessage("abort ingest because of previous errors")
            self.log.logMessage(traceback.format_exc())
            return -1
        except Exception as e:
            self.errorflag = True
            self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: JSON format cannot be read; ' + str(e).replace("'", "''"))
            self.log.logMessage(self.filename + ' json format wrong' + str(e) )
            self.log.logMessage(traceback.format_exc())
            return -1

    def checkSite(self):
        """Return True when the site digits of the specimen id (characters 3-5) exist in CARE_SITE."""
        specimentext = self.genomicalDataAll['specimenId'].strip()
        site = specimentext[3:6]
        # Bound as an integer: the original compared against the unquoted digits.
        row = self._fetch_one("SELECT * from CARE_SITE where care_site_id= %s", (int(site),))
        return row is not None

    def checkCompany(self):
        """Return True when the specimen id's 2-char source code exists in CONCEPT_DATA_SOURCES."""
        specimentext = self.genomicalDataAll['specimenId'].strip()
        source_txt = _SOURCE_CODE_PATTERN.findall(specimentext[12:])[0]
        row = self._fetch_one("SELECT data_source_concept_id from CONCEPT_DATA_SOURCES where short_code = %s", (source_txt,))
        return row is not None

    def checkResubmission(self, specimen_id, baseline, tar_id, source_txt):
        """Classify the file as a new ingest or a resubmission.

        Returns:
            ``(code, measurement_gene_panel_id)`` where code is
            0 - no earlier panel for this specimen/baseline/source (id is 0);
            1 - an earlier panel exists and may be replaced;
            2 - the patient was discussed on or after the earlier ingestion,
                so the earlier panel must be kept and a new version added.
        """
        panel = self._fetch_one(
            "select measurement_gene_panel_id,ingestion_date from MEASUREMENT_GENE_PANEL as mgp "
            "LEFT JOIN SPECIMEN s on mgp.specimen_id=s.specimen_id "
            "LEFT JOIN CONCEPT_DATA_SOURCES gp on gp.data_source_concept_id = mgp.data_source_concept_id "
            "where s.specimen_id=%s and mgp.baseline_number=%s and gp.short_code=%s "
            "order by measurement_gene_panel_id desc",
            (specimen_id, baseline, source_txt))
        if panel is None:
            return 0, 0
        measurement_gene_panel_id = panel[0]
        ingestion_date = panel[1]

        last_meeting = self._fetch_one(
            "SELECT created_on FROM MEETING_OUTCOME as mo LEFT JOIN PERSON as p on p.person_id=mo.person_id "
            "where p.target_id=%s ORDER BY created_on DESC;",
            (tar_id,))
        if last_meeting is not None and last_meeting[0] >= ingestion_date:
            # new version required; discussed after previous ingestion
            return 2, measurement_gene_panel_id
        # never discussed, or discussed but not this report
        return 1, measurement_gene_panel_id

    def deleteGenomicData(self, measurement_gene_panel_id):
        """Delete a gene panel together with its variants and their selections."""
        panel_param = (measurement_gene_panel_id,)
        # Children first: selections -> variants -> panel.
        self._execute(
            "DELETE from SELECTED_GENE_VARIANT where SELECTED_GENE_VARIANT.measurement_gene_variant_id in "
            "(select SELECTED_GENE_VARIANT.measurement_gene_variant_id from SELECTED_GENE_VARIANT "
            "LEFT JOIN MEASUREMENT_GENE_VARIANT on MEASUREMENT_GENE_VARIANT.measurement_gene_variant_id=SELECTED_GENE_VARIANT.measurement_gene_variant_id "
            "where measurement_gene_panel_id=%s)",
            panel_param)
        self._execute("DELETE from MEASUREMENT_GENE_VARIANT where measurement_gene_panel_id=%s", panel_param)
        self._execute("DELETE from MEASUREMENT_GENE_PANEL where measurement_gene_panel_id=%s", panel_param)

    def getSpecimen(self):
        """Parse the specimen id and look up the matching patient and specimen.

        Returns:
            ``(specimen_id, baseline_number, target_id, source_code)``.
            ``target_id`` is None when the patient is unknown; ``specimen_id``
            and ``baseline_number`` are None when no specimen row matches.

        Raises:
            GenomicDataException: a required field is missing, the specimen id
                violates the naming convention, or sampleType is unsupported.
        """
        data = self.genomicalDataAll
        if 'specimenId' not in data or len(data['specimenId']) == 0:
            raise self._failure('Error: Data import failed (specimenId not present)',
                                'Error specimenId not present',
                                'Error specimenId not present')
        specimenId = data['specimenId'].strip()
        if 'sampleType' not in data or len(data['sampleType']) == 0:
            raise self._failure('Error: Data import failed (sampleType not present)',
                                'Error sampleType not present',
                                'Error sampleType not present')
        test_type = data['sampleType']

        tarid = specimenId[0:10]
        if not _PATIENT_PATTERN.match(specimenId):
            raise self._failure('Error: specimen ID is incompatible with the naming convention',
                                'Error ingesting Genomic data - patient ID has the wrong structure',
                                'Error person id does not match pattern')
        timepoint = specimenId[10:]
        if not any(pattern.match(timepoint) for pattern in _TIMEPOINT_PATTERNS):
            raise self._failure('Error: The time point is incompatible with the naming convention',
                                'Error ingesting Genomic data - time point has the wrong structure',
                                'Error timepoint does not match pattern')
        if not any(pattern.match(timepoint) for pattern in _TIMEPOINT_SOURCE_PATTERNS):
            raise self._failure('Error: Specimen ID is incompatible with the naming convention',
                                'Error ingesting Genomic data - 2 char source not found',
                                'Error source chars does not match pattern')
        source_txt = _SOURCE_CODE_PATTERN.findall(specimenId[12:])[0]

        if self._fetch_one("SELECT person_id from PERSON where target_id= %s", (tarid,)) is None:
            return None, None, None, source_txt

        sample_kind = test_type.lower()
        if sample_kind == 'blood':
            found = _BLOOD_TIMEPOINT_PATTERN.findall(timepoint)
            if len(found) == 0:
                raise self._failure('Error: specimenId and sampleType are not compatible',
                                    'Error ingesting Genomic data - speicmenID is not of correct structure for blood',
                                    'Error specimenID is not of correct structure for blood')
            baseline_id = found[0][1:]  # drop the leading "T"
            # Bound as an integer: the original compared against the unquoted digits.
            row = self._fetch_one(
                "SELECT SPECIMEN.specimen_id, SPECIMEN.baseline_number from PERSON "
                "LEFT JOIN SPECIMEN on PERSON.person_id=SPECIMEN.person_id "
                "where PERSON.target_id=%s and SPECIMEN.specimen_concept_id=1 and SPECIMEN.baseline_number=%s",
                (tarid, int(baseline_id)))
        elif sample_kind == 'tissue':
            found = _TISSUE_TIMEPOINT_PATTERN.findall(timepoint)
            if len(found) == 0:
                raise self._failure('Error: specimenId and sampleType are not compatible',
                                    'Error ingesting Genomic data - speicmenID is not of correct structure for tissue',
                                    'Error specimenID is not of correct structure for tissue')
            baseline_id = found[0][2:]  # drop the leading "Bx"
            # Bound as text: the original compared against the quoted digits.
            row = self._fetch_one(
                "SELECT SPECIMEN.specimen_id, SPECIMEN.baseline_number from PERSON "
                "LEFT JOIN SPECIMEN on PERSON.person_id=SPECIMEN.person_id "
                "where PERSON.target_id=%s and SPECIMEN.specimen_concept_id!=1 and SPECIMEN.baseline_number=%s",
                (tarid, str(baseline_id)))
        else:
            raise self._failure('Error: sampleType must be either "blood" or "tissue"',
                                'Error ingesting Genomic data - sampleType is not a recognised string',
                                'Error sampleType is not a recognised string')

        if row is None:
            return None, None, tarid, source_txt
        return row[0], row[1], tarid, source_txt

    def insertMeasurementGenePanel(self, specimen_id, baseline, source_txt):
        """Insert the MEASUREMENT_GENE_PANEL row for this report and return its id.

        Reads the optional ``sample`` and ``bioMarkers`` sections of the
        document. Any exception is logged, flagged in ``errorflag`` and re-raised.
        """
        try:
            tumourFractionScore = None
            tumourFractionUnit = None
            mean_exon_depth = None
            percentExons100x = None
            data = self.genomicalDataAll

            if 'sample' in data:
                sample = data['sample']
                if 'tumourFractionScore' in sample:
                    fraction = sample['tumourFractionScore']
                    if 'value' not in fraction:
                        raise self._failure('Error: tumourFractionScore requires a value',
                                            self.filename + ' tumourFractionScore requires a value',
                                            "missing value in tumourFractionScore")
                    tumourFractionScore = fraction['value']
                    if not isinstance(tumourFractionScore, (float, int)):
                        raise self._failure('Error: tumourFractionScore requires a value field',
                                            self.filename + ' value in tumourFractionScore must be number',
                                            "value in tumourFractionScore must be a number")
                    tumourFractionScore = str(tumourFractionScore)
                    if 'unit' in fraction:
                        tumourFractionUnit = fraction['unit']
                        if tumourFractionUnit is not None:
                            tumourFractionUnit = tumourFractionUnit.strip()
                if 'meanExonDepth' in sample:
                    mean_exon_depth = str(sample['meanExonDepth'])
                if 'percentExons100x' in sample:
                    percentExons100x = str(sample['percentExons100x'])

            mistatus = None
            tmbscore = None
            tmbstatus = None
            tmbunit = None
            if 'bioMarkers' in data:
                biomarkers = data['bioMarkers']
                if not isinstance(biomarkers, list):
                    raise self._failure('Error: "bioMarkers" must be a list',
                                        self.filename + ' biomarkers must be of type list',
                                        "value in bioMarker must be a list")
                for biomarker in biomarkers:
                    if 'type' not in biomarker:
                        raise self._rejection('bioMarkers requires a type field')
                    biomarker_type = biomarker['type'].lower()
                    if biomarker_type == 'microsatellite-instability':
                        if 'status' not in biomarker:
                            raise self._rejection('microsatellite-instability requires a status field')
                        mistatus = biomarker['status']
                        if len(mistatus.strip()) == 0:
                            raise self._rejection('status field needs a value in microsatellite-instability')
                    elif biomarker_type == 'tumour mutation burden':
                        if 'status' not in biomarker:
                            raise self._rejection('tumour mutation burden requires a status field')
                        tmbstatus = biomarker['status']
                        if len(tmbstatus.strip()) == 0:
                            raise self._rejection('status field needs a value in tumour mutation burden')
                        if 'score' in biomarker:
                            score = biomarker['score']
                            if 'value' not in score:
                                raise self._rejection('tumour mutation burden requires a score value')
                            tmbscore = str(score['value'])
                            if 'unit' in score:
                                tmbunit = score['unit']
                    else:
                        raise self._rejection('bioMarker type not supported')

            source_row = self._fetch_one(
                "SELECT data_source_concept_id from CONCEPT_DATA_SOURCES where short_code = %s",
                (source_txt,))
            if source_row is None:
                raise self._failure('Error: data source not registered',
                                    self.filename + ' data source not registered ' + source_txt,
                                    "data source not found")
            data_source_concept_id = source_row[0]

            addFMsql = "INSERT INTO MEASUREMENT_GENE_PANEL (specimen_id, data_source_concept_id,average_read_depth, ngs_run, baseline_number, microsatellite_instability_status, tmb_score, tmb_status, tmb_unit, mean_exon_depth, percent_exons_100x, tumour_fraction_score, tumour_fraction_unit) VALUES(%s, %s, 'n/a', 'FM', %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            return self._insert(addFMsql, (specimen_id, data_source_concept_id, baseline, mistatus, tmbscore, tmbstatus, tmbunit, mean_exon_depth, percentExons100x, tumourFractionScore, tumourFractionUnit))
        except Exception as e:
            self.log.logMessage('cannot get all biomarker values')
            self.log.logMessage(str(e))
            self.errorflag = True
            raise

    def parseAlterations(self, measrumentgenepanelid, tar_id, updateSelection, prev_gene_panel_id):
        """Store every entry of the document's ``alterations`` list, dispatching on its type.

        Raises:
            GenomicDataException: an alteration has no string ``type`` or an
                unsupported one.
        """
        if 'alterations' not in self.genomicalDataAll:
            return
        handlers = {
            'short variant': self.parseShortVariant,
            'copy number alteration': self.parseCNA,
            'rearrangement': self.parseRearrangement,
        }
        for alteration in self.genomicalDataAll['alterations']:
            if 'type' not in alteration:
                raise self._rejection('all alterations require a type field')
            if not isinstance(alteration['type'], str):
                raise self._rejection('alterations type field must be string')
            handler = handlers.get(alteration['type'].lower())
            if handler is None:
                raise self._rejection('alterations type not supported')
            handler(alteration, measrumentgenepanelid, tar_id, updateSelection, prev_gene_panel_id)

    def parseShortVariant(self, alteration, measrumentgenepanelid, tar_id, updateSelection, prev_gene_panel_id):
        """Validate one short-variant alteration and insert it into MEASUREMENT_GENE_VARIANT.

        When ``updateSelection`` is true, a selection recorded for the same
        variant in panel ``prev_gene_panel_id`` is copied to the new row.
        """
        context = 'short variant alterations'
        gene = self._required_text(alteration, 'gene', context)
        proteineffect = self._required_text(alteration, 'proteinChange', context)
        cdseffect = self._required_text(alteration, 'cdsChange', context)
        depth = self._required_value(alteration, 'readDepth', context)
        position = self._required_text(alteration, 'position', context)
        status = alteration.get('status')
        transcript = alteration.get('transcript')
        allelefraction = self._required_value(alteration, 'variantAlleleFrequency%', context)
        classification = self._required_text(alteration, 'functionalEffect', context)
        subclonal = self._optional_flag(alteration, 'subclonal', context)

        gene_id = self.getGene(gene)
        addShortVariant = "INSERT INTO MEASUREMENT_GENE_VARIANT (measurement_gene_panel_id, gene_concept_id, cdna_change, read_depth, variant_allele_frequency, position, amino_acid_change, transcript, status, variant_type, functional_effect, subclonal) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        rowid = self._insert(addShortVariant, (str(measrumentgenepanelid), str(gene_id), cdseffect, depth, allelefraction, position, proteineffect, transcript, status, 'short_variant', classification, subclonal))

        if updateSelection:
            # check whether was ticked before
            self._carry_over_selection(tar_id, prev_gene_panel_id, gene, rowid, (
                ('amino_acid_change', str(proteineffect)),
                ('cdna_change', str(cdseffect)),
            ))

    def parseCNA(self, alteration, measrumentgenepanelid, tar_id, updateSelection, prev_gene_panel_id):
        """Validate one copy-number alteration and insert it into MEASUREMENT_GENE_VARIANT.

        When ``updateSelection`` is true, a selection recorded for the same
        alteration in panel ``prev_gene_panel_id`` is copied to the new row.
        """
        context = 'CNA'
        copynumber = self._required_value(alteration, 'copyNumber', context)
        gene = self._required_text(alteration, 'gene', context)
        gene_id = self.getGene(gene)
        cna_type = self._required_text(alteration, 'description', context)
        numberofexons = alteration.get('exons')
        position = alteration.get('position')
        ratio = alteration.get('ratio')
        status = alteration.get('status')
        equivocal = self._optional_flag(alteration, 'equivocal', context)

        addCopyNumberAlteration = "INSERT INTO MEASUREMENT_GENE_VARIANT (measurement_gene_panel_id, gene_concept_id, copy_number, exons, position, cna_ratio, status, cna_type, variant_type, equivocal) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        rowid = self._insert(addCopyNumberAlteration, (str(measrumentgenepanelid), str(gene_id), copynumber, numberofexons, position, ratio, status, cna_type, 'copy_number_alteration', equivocal))

        if updateSelection:
            # check whether was ticked before

            def as_text(value):
                # These columns were compared as quoted text; None stays None so it matches NULL.
                return None if value is None else str(value)

            self._carry_over_selection(tar_id, prev_gene_panel_id, gene, rowid, (
                ('copy_number', copynumber),
                ('exons', as_text(numberofexons)),
                ('position', as_text(position)),
                ('cna_ratio', ratio),
                ('status', as_text(status)),
                ('cna_type', as_text(cna_type)),
            ))

    def parseRearrangement(self, alteration, measrumentgenepanelid, tar_id, updateSelection, prev_gene_panel_id):
        # NOTE(review): this method continues beyond the reviewed excerpt, so the
        # visible part is reproduced unchanged (only re-indented).
        #parse the sections with rearrangement and add to db
        print('parseRearrangement')
        description = None
        inframe = None
        pos1 = None
        pos2 = None
        status = None
        supportingreadpairs = None
        type = None
        allelefraction = None
        rtype=None
        if not 'gene1' in alteration:
            self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: gene1 field is required in rearrangements')
            self.log.logMessage(self.filename + ' gene1 field is required in rearrangements')
            self.errorflag=True
            raise GenomicDataException("gene1 field is required in rearrangements")
        if len(alteration['gene1'].strip())==0:
            self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: gene1 field needs a value in rearrangements')
            self.log.logMessage(self.filename + ' gene1 field needs a value in rearrangements')
            self.errorflag=True
            raise GenomicDataException("gene1 field needs a value in rearrangements")
        gene1_id = self.getGene(alteration['gene1'])
        if not 'gene2' in alteration:
            self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: gene2 field is required in rearrangements')
            self.log.logMessage(self.filename + ' gene2 field is required in rearrangements')
            self.errorflag=True
            raise GenomicDataException("gene2 field is required in rearrangements")
        if len(alteration['gene2'].strip())==0:
            self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: gene2 field needs a value in rearrangements')
            self.log.logMessage(self.filename + ' gene2 field needs a value in rearrangements')
            self.errorflag=True
            raise
GenomicDataException("gene2 field needs a value in rearrangements") gene2_id = self.getGene(alteration['gene2']) if not 'rearrangementType' in alteration: self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: rearrangementType field is required in rearrangements') self.log.logMessage(self.filename + ' rearrangementType field is required in rearrangements') self.errorflag=True raise GenomicDataException("rearrangementType field is required in rearrangements") rtype = alteration['rearrangementType'] if len(rtype.strip())==0: self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: rearrangementType field needs a value in rearrangements') self.log.logMessage(self.filename + ' rearrangementType field needs a value in rearrangements') self.errorflag=True raise GenomicDataException("rearrangementType field needs a value in rearrangements") if not 'position1' in alteration: self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: position1 field is required in rearrangements') self.log.logMessage(self.filename + ' position1 field is required in rearrangements') self.errorflag=True raise GenomicDataException("position1 field is required in rearrangements") pos1 = alteration['position1'] if len(pos1.strip())==0: self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: position1 field needs a value in rearrangements') self.log.logMessage(self.filename + ' position1 field needs a value in rearrangements') self.errorflag=True raise GenomicDataException("position1 field needs a value in rearrangements") if not 'position2' in alteration: self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: position2 field is required in rearrangements') self.log.logMessage(self.filename + ' position2 field is required in rearrangements') self.errorflag=True raise GenomicDataException("position2 field is required in rearrangements") pos2 = 
alteration['position2'] if len(pos2.strip())==0: self.log.systemStatusUpdate(self.filename, 'Genomic', self.log.timestamp(), 'Error: position2 field needs a value in rearrangements') self.log.logMessage(self.filename + ' position2 field needs a value in rearrangements') self.errorflag=True raise GenomicDataException("position2 field needs a value in rearrangements") if 'status' in alteration: status=alteration['status'] if 'variantAlleleFrequency' in alteration: allelefraction=alteration['variantAlleleFrequency'] if 'inFrame' in alteration: inframe = alteration['inFrame'] if 'supportingReadPairs' in alteration: supportingreadpairs = alteration['supportingReadPairs'] if 'description' in alteration: description = alteration['description'] #insert into measurement_gene_variant addRearrangement = "INSERT INTO MEASUREMENT_GENE_VARIANT (measurement_gene_panel_id, rearr_description, rearr_in_frame, rearr_gene_2, \ rearr_pos1, rearr_pos2, status, rearr_number_of_reads, rearr_gene_1, variant_type, variant_allele_frequency, rearr_type) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" cursor = self.conn.cursor() cursor.execute(addRearrangement, (str(measrumentgenepanelid), description, inframe, str(gene2_id), pos1, pos2, status, supportingreadpairs, str(gene1_id),'rearrangement', allelefraction, rtype)) rowid=cursor.lastrowid if updateSelection: #check whether was ticked before findPreviouseSelectionQuery="select type from SELECTED_GENE_VARIANT FULL OUTER JOIN MEASUREMENT_GENE_VARIANT on \ MEASUREMENT_GENE_VARIANT.measurement_gene_variant_id=SELECTED_GENE_VARIANT.measurement_gene_variant_id \ FULL OUTER JOIN MEASUREMENT_GENE_PANEL on MEASUREMENT_GENE_PANEL.measurement_gene_panel_id=MEASUREMENT_GENE_VARIANT.measurement_gene_panel_id \ where person_id=(SELECT person_id FROM PERSON WHERE person.target_id = '"+tar_id+"') and \ MEASUREMENT_GENE_PANEL.measurement_gene_panel_id=" +str(prev_gene_panel_id)+ " and \ rearr_gene_1="+str(gene1_id)+" and rearr_description = 
'"+str(description)+"' and rearr_in_frame ='"+str(inframe)+"' and rearr_gene_2="+str(gene2_id)+" \ and rearr_pos1='"+str(pos1)+"' and rearr_pos2='"+str(pos2)+"' and status='"+str(status)+"' and rearr_number_of_reads="+str(supportingreadpairs)+";" # print(findPreviouseSelectionQuery) findPreviousSelectionCursor = self.conn.cursor() findPreviousSelectionCursor.execute(findPreviouseSelectionQuery) row=findPreviousSelectionCursor.fetchone() # print(row) findPreviousSelectionCursor.close() if row is not None: #was previously selected insertSelectionQuery="insert into SELECTED_GENE_VARIANT (person_id, measurement_gene_variant_id, type) values((SELECT person_id FROM PERSON WHERE person.target_id = '"+tar_id+"'),"+str(rowid)+",'"+row[0]+"');" # print("---------\n",insertSelectionQuery) insertSelectionCursor = self.conn.cursor() insertSelectionCursor.execute(insertSelectionQuery) insertSelectionCursor.close() def getGene(self, gene): genesql = "SELECT gene_concept_id FROM CONCEPT_GENE WHERE gene_name = '"+str(gene)+"'" cursor = self.conn.cursor() cursor.execute(genesql) row = cursor.fetchone() if row is not None: print(row[0]) return row[0] insertgene = "INSERT INTO CONCEPT_GENE (gene_name) VALUES(%s)" cursor = self.conn.cursor() cursor.execute(insertgene,(str(gene))) gene_id = cursor.lastrowid print(str(gene_id)) return gene_id