archive/preprocess/jig2tsv.py (17 lines of code) (raw):
import xml.etree.ElementTree as ET
from nltk.tokenize import sent_tokenize
import csv
def jig2tsv(jigFile, tsvFile):
tree = ET.parse(jigFile)
root = tree.getroot()
wf = open(tsvFile, 'w')
wtr = csv.writer(wf, delimiter='\t', doublequote=False, escapechar='\\', quoting=csv.QUOTE_MINIMAL)
for document in root:
docText = document.find('docText').text.encode('ascii', 'ignore').replace('"','\'').replace('\n', '').lower()
sentences = sent_tokenize(docText)
for sentence in sentences:
if( len(sentence) > 0 ):
wtr.writerow(['', sentence, '1'])
wf.close()
if __name__ == "__main__":
jig2tsv(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else "output.tsv")