__author__ = 'ronalddekker'

# The goal of this file is to construct a token stream from a single act of a single witness,
# do the same for two more witnesses, and collate the three token streams.

# imports
from collatex import *
from collatex.core_classes import WordPunctuationTokenizer
from collatex.suffix_based_scorer import Scorer
from integration.xml_tokenization import tokenize_xml_file
from integration.xml_tokenization import tokenize_plain_text_file

# main routine
# open the first source file (TEI/XML)
tei_file_1823_act1 = open("/Users/ronalddekker/Desktop/CollateX/Elisa Files/1823_act_1.xml")

# parse the file, XML event after XML event, and wrap each token string in a token dictionary
token_stream_witness_1 = []
for token_string in tokenize_xml_file(tei_file_1823_act1):
    # print(">" + str(token_string) + "< ")
    token_data = {}
    token_data["t"] = token_string
    # token_data["n"] = token_string.lower()
    token_stream_witness_1.append(token_data)
print(token_stream_witness_1)

# open the second source file (TEI/XML)
tei_file_second_edition_act1 = open("/Users/ronalddekker/Desktop/CollateX/Elisa Files/second_edition_act_1.xml")

# parse the file, XML event after XML event
token_stream_witness_2 = []
for token_string in tokenize_xml_file(tei_file_second_edition_act1):
    # print(">" + str(token_string) + "< ")
    token_data = {}
    token_data["t"] = token_string
    # token_data["n"] = token_string.lower()
    token_stream_witness_2.append(token_data)
print(token_stream_witness_2)

# open the third source file (plain text)
plain_text_third_witness = open("/Users/ronalddekker/Desktop/CollateX/Elisa Files/Act1_version3.txt")

token_stream_witness_3 = []
for token_string in tokenize_plain_text_file(plain_text_third_witness):
    # print(">" + str(token_string) + "< ")
    token_data = {}
    token_data["t"] = token_string
    # token_data["n"] = token_string.lower()
    token_stream_witness_3.append(token_data)
print(token_stream_witness_3)

# create the pretokenized JSON input block
witness_data_1 = {}
witness_data_2 = {}
witness_data_3 = {}
witness_data_1["id"] = "1"
witness_data_2["id"] = "2"
witness_data_3["id"] = "3"
# use only the first 500 tokens of each witness
witness_data_1["tokens"] = token_stream_witness_1[0:500]
witness_data_2["tokens"] = token_stream_witness_2[0:500]
witness_data_3["tokens"] = token_stream_witness_3[0:500]
pretokenized_json = {}
pretokenized_json["witnesses"] = [witness_data_1, witness_data_2, witness_data_3]

print(len(token_stream_witness_1))
print(len(token_stream_witness_2))
print(len(token_stream_witness_3))
print(len(witness_data_1["tokens"]))
print(len(witness_data_2["tokens"]))
print(len(witness_data_3["tokens"]))

# # debug
# collation = Collation()
# for witness in pretokenized_json["witnesses"]:
#     collation.add_witness(witness)
# algorithm = Scorer(collation)
# block_witness1 = algorithm._get_block_witness(collation.witnesses[0])
# block_witness2 = algorithm._get_block_witness(collation.witnesses[1])
# block_witness3 = algorithm._get_block_witness(collation.witnesses[2])
# print(block_witness1.debug())
# print(block_witness2.debug())
# print(block_witness3.debug())

alignment_table = collate_pretokenized_json(pretokenized_json)
print(alignment_table)
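
# For reference: a minimal sketch of the pretokenized input shape that is built above and
# passed to collate_pretokenized_json. The witness ids and token strings here are made-up
# examples (not taken from the Elisa files), and the block is kept commented out so it does
# not affect the script's output.
#
# example_pretokenized_json = {
#     "witnesses": [
#         {"id": "A", "tokens": [{"t": "The"}, {"t": "quick"}, {"t": "fox"}]},
#         {"id": "B", "tokens": [{"t": "The"}, {"t": "brown"}, {"t": "fox"}]}
#     ]
# }
# print(collate_pretokenized_json(example_pretokenized_json))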