in archive/preprocess/jig2tsv.py [0:0]
def jig2tsv(jigFile, tsvFile):
    """Convert the XML document collection in jigFile to a TSV of sentences.

    Every child element of the XML root is treated as one document. The text
    of its ``docText`` child is reduced to ASCII (non-ASCII characters are
    dropped), double quotes are replaced by single quotes, newlines are
    removed and the result is lower-cased. The cleaned text is split with
    ``sent_tokenize`` and each non-empty sentence is written as one
    tab-separated row ``['', sentence, '1']`` (first column empty, third
    column the constant ``'1'``; the meaning of these columns is defined by
    the consumer of the file and is not visible here).

    Documents without a ``docText`` element, or with an empty one, produce no
    rows.

    Args:
        jigFile: XML source handed to ``ET.parse``.
        tsvFile: Path of the TSV file to write; it is opened in ``'w'`` mode,
            so an existing file is overwritten.
    """
    # Parse first so that a malformed input does not create/truncate the output.
    root = ET.parse(jigFile).getroot()

    # The context manager closes the output even if tokenizing or writing fails.
    with open(tsvFile, 'w') as wf:
        wtr = csv.writer(wf, delimiter='\t', doublequote=False,
                         escapechar='\\', quoting=csv.QUOTE_MINIMAL)
        for document in root:
            rawText = document.findtext('docText')
            if not rawText:
                # Missing or empty docText: nothing to tokenize.
                continue
            # Round-trip through ASCII to drop non-ASCII characters while
            # keeping a text string; encode() alone yields bytes on Python 3,
            # which cannot be mixed with the str arguments of replace().
            docText = rawText.encode('ascii', 'ignore').decode('ascii')
            docText = docText.replace('"', '\'').replace('\n', '').lower()
            for sentence in sent_tokenize(docText):
                if sentence:
                    wtr.writerow(['', sentence, '1'])