#!/usr/bin/env python
# This script reads sentences from the CoNLL-2009 English *training* data
# and Q/A annotation from our released QA-SRL data, and
# outputs a file with sentences and their QA annotation.
# Matching of CoNLL sentences and Q/A annotation is based on the sentence Id number.
#
# Example usage:
# python newswire_prepare.py /path/to/CoNLL2009-ST-English-train.txt newswire_nosent.train.qa newswire.train.qa
# python newswire_prepare.py /path/to/CoNLL2009-ST-English-train.txt newswire_nosent.dev.qa newswire.dev.qa
# python newswire_prepare.py /path/to/CoNLL2009-ST-English-train.txt newswire_nosent.test.qa newswire.test.qa
#
# Author: Luheng He (firstname at cs dot washington dot edu)

import sys
import re


def _read_conll_sentences(conll_path):
    """Return the sentences of a CoNLL file as lists of word forms.

    Each non-blank line contributes its second whitespace-separated column;
    each blank line terminates the current sentence. Sentences are returned
    in file order, so a sentence's list index is its 0-based position.
    """
    sentences = []
    tokens = []
    with open(conll_path, 'r') as conll_input:
        for line in conll_input:
            fields = line.split()
            if not fields:
                # Blank line: sentence boundary.
                sentences.append(tokens)
                tokens = []
            else:
                tokens.append(fields[1])
    # Keep the final sentence even when the file lacks a trailing blank line.
    if tokens:
        sentences.append(tokens)
    return sentences


def prepare(conll_path, input_path, output_path):
    """Fill the sentence line of every QA record with the CoNLL sentence.

    The QA input consists of records separated by blank lines. The first
    line of a record starts with an ID of the form ``<prefix>_<number>``;
    the number is used as a 0-based index into the sentences read from
    ``conll_path``. The second line of the record is replaced by that
    sentence (tokens joined with single spaces); every other line is
    copied to the output unchanged.

    Args:
        conll_path: path to the CoNLL file providing the sentences.
        input_path: path to the QA file whose sentence lines are to be filled.
        output_path: path of the file to write.

    Raises:
        IndexError: if a record refers to a sentence index that is not
            present in the CoNLL file.
    """
    conll_sentences = _read_conll_sentences(conll_path)

    with open(input_path, 'r') as qa_input, \
            open(output_path, 'w') as qa_output:
        lc = 0  # Position of the current line within its record.
        sent_id = -1
        for line in qa_input:
            info = line.split()
            if lc == 0:
                if not info:
                    # Extra blank line between records: pass it through
                    # instead of failing on the missing ID field.
                    qa_output.write(line)
                    continue
                # Read sentence ID.
                sent_id = int(info[0].split("_")[1])
                qa_output.write(line)
            elif lc == 1:
                # Write sentence.
                if not 0 <= sent_id < len(conll_sentences):
                    raise IndexError(
                        "sentence id %d is out of range: only %d sentences "
                        "were read from %s"
                        % (sent_id, len(conll_sentences), conll_path))
                qa_output.write(" ".join(conll_sentences[sent_id]) + "\n")
            else:
                # Write other QA data.
                qa_output.write(line)
            # Update line counter: a blank line ends the current record.
            lc = lc + 1 if info else 0


if __name__ == "__main__":
    if len(sys.argv) < 4:
        # Single parenthesised argument prints identically on Python 2 and 3.
        print("Usage: python newswire_prepare.py {path to trainging portion CoNLL2009 English data} {input qa data} {prepared qa data}")
    else:
        prepare(sys.argv[1], sys.argv[2], sys.argv[3])