#!/usr/bin/env python
# This script reads sentences from the CoNLL-2009 English *training* data
# and Q/A annotations from our released QA-SRL data, then
# outputs a file with the sentences and their Q/A annotations.
# Matching of CoNLL sentences and Q/A annotations is based on the sentence ID number.
#
# Example usage:
# 	python newswire_prepare.py /path/to/CoNLL2009-ST-English-train.txt newswire_nosent.train.qa newswire.train.qa
# 	python newswire_prepare.py /path/to/CoNLL2009-ST-English-train.txt newswire_nosent.dev.qa newswire.dev.qa
# 	python newswire_prepare.py /path/to/CoNLL2009-ST-English-train.txt newswire_nosent.test.qa newswire.test.qa
#
# Author: Luheng He (firstname at cs dot washington dot edu)

import sys, re

def prepare(conll_path, input_path, output_path):
	"""Fill CoNLL sentences into a QA annotation file.

	The CoNLL file is read as blank-line-separated sentences with one token
	per line; the second whitespace-separated column of each token line is
	used as the word. Every blank line closes a sentence, so sentence IDs
	are zero-based positions in that sequence.

	The QA file is read as blank-line-separated records. The first line of
	a record is a header whose first token looks like ``<prefix>_<id>``;
	the integer ``<id>`` selects the CoNLL sentence. The second line of the
	record is replaced by that sentence (tokens joined with single spaces).
	All other lines are copied through unchanged.

	Args:
		conll_path: Path to the CoNLL-formatted sentence file.
		input_path: Path to the QA annotation file without sentences.
		output_path: Path of the QA annotation file to write.

	Raises:
		IndexError: If a token line has fewer than two columns, a header
			has no ``_`` separated ID, or an ID is past the last sentence.
		ValueError: If the ID part of a header is not an integer.
	"""
	conll_sentences = []
	tokens = []
	with open(conll_path, 'r') as conll_input:
		for line in conll_input:
			fields = line.split()
			if not fields:
				# A blank line always closes a sentence (even an empty one),
				# which keeps the sentence numbering stable.
				conll_sentences.append(tokens)
				tokens = []
			else:
				tokens.append(fields[1])
	# Keep the final sentence when the file does not end with a blank line.
	if tokens:
		conll_sentences.append(tokens)

	with open(input_path, 'r') as qa_input, open(output_path, 'w') as qa_output:
		# Position of the current line inside its record (0 = header line).
		lc = 0
		sent_id = -1
		for line in qa_input:
			info = line.split()
			is_blank = not info
			if lc == 0:
				if is_blank:
					# Extra separator line between records: copy it through
					# and keep waiting for the next header.
					qa_output.write(line)
					continue
				# Read sentence ID from the header, e.g. "<prefix>_<id>".
				sent_id = int(info[0].split("_")[1])
				qa_output.write(line)
			elif lc == 1:
				# Replace the second line of the record with the sentence.
				qa_output.write(" ".join(conll_sentences[sent_id]) + "\n")
			else:
				# Copy all other QA data unchanged.
				qa_output.write(line)

			# A blank line ends the record; otherwise advance within it.
			lc = 0 if is_blank else lc + 1


# Script entry point: expects the CoNLL file, the input QA file and the
# output QA file as the three command-line arguments.
if __name__ == "__main__":
	if len(sys.argv) < 4:
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print("Usage: python newswire_prepare.py {path to training portion CoNLL2009 English data} {input qa data} {prepared qa data}")
	else:
		prepare(sys.argv[1], sys.argv[2], sys.argv[3])