#!/usr/bin/env python3
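"""Transmorgrify: learn character level text transformations.

Pairs of from/to sentences are reduced to edit traces of match, insert and
delete actions.  CatBoost classifiers are trained to predict the next action
(and, for inserts, the character) from a sliding window of context characters,
and the same models are then replayed character by character to transform new
text.
"""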
import argparse
import json
import os
import zipfile
import pandas as pd
from catboost import CatBoostClassifier, Pool
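#Action labels produced by the edit trace and predicted by the action model.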
MATCH = 0
DELETE_FROM = 1
INSERT_TO = 2
START = 3
FILE_VERSION = 1
class Transmorgrifier:
def train( self, from_sentences, to_sentences, iterations = 4000, device = 'cpu', trailing_context = 7, leading_context = 7, verbose=True ):
"""
Train the Transmorgrifier model. This does not save it to disk but just trains in memory.
Keyword arguments:
from_sentences -- An array of strings for the input sentences.
to_sentences -- An array of strings of the same length as from_sentences which the model is to train to convert to.
        iterations -- An integer specifying the number of CatBoost training iterations. (default 4000)
        device -- The GPU device specifier which catboost wants, or "cpu". (default cpu)
trailing_context -- The number of characters after the action point to include for context. (default 7)
leading_context -- The number of characters before the action point to include for context. (default 7)
        verbose -- Increases the amount of text output during training. (default True)
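        Example (a minimal sketch; the sentences and settings below are illustrative):
            tm = Transmorgrifier()
            tm.train( from_sentences=["teh cat sat", "a dgo ran"],
                      to_sentences=["the cat sat", "a dog ran"],
                      iterations=100 )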
"""
X,Y = _parse_for_training( from_sentences, to_sentences, num_pre_context_chars=leading_context, num_post_context_chars=trailing_context )
#train and save the action_model
self.action_model = _train_catboost( X, Y['action'], iterations, verbose=verbose, device=device, model_piece='action' )
#and the char model
        #select only the rows where the action is an insert.
insert_indexes = Y['action'] == INSERT_TO
#if there is only one char to insert, we can't train the second model and need to handle that as a boundary case.
if Y['char'][insert_indexes].nunique() > 1:
self.char_model = _train_catboost( X[insert_indexes], Y['char'][insert_indexes], iterations, verbose=verbose, device=device, model_piece='char' )
self.constant_output = None
else:
self.char_model = None
if Y['char'][insert_indexes].nunique() == 1:
self.constant_output = Y['char'][insert_indexes].unique()[0]
else:
                #If insertion never occurs in the training data, fall back to always "inserting" a space.
                #The model will never actually predict an insert, but this keeps the boundary case
                #consistent so the saving and loading code works.
self.constant_output = ' '
self.trailing_context = trailing_context
self.leading_context = leading_context
self.iterations = iterations
return self
def save( self, model='my_model.tm' ):
"""
Saves the model previously trained with train to a specified model file.
Keyword arguments:
model -- The pathname to save the model such as "my_model.tm" (default my_model.tm)
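        Example (a minimal sketch; the file name is illustrative):
            tm.save( "spelling_fix.tm" )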
"""
self.name = model
with zipfile.ZipFile( model, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9 ) as my_zip:
with my_zip.open( 'params.json', mode='w' ) as out:
params = {
'version': FILE_VERSION,
'leading_context': self.leading_context,
'trailing_context': self.trailing_context,
'iterations': self.iterations,
}
if self.constant_output is not None:
params['constant_output'] = self.constant_output
out.write( json.dumps(params).encode())
temp_filename = _mktemp()
self.action_model.save_model( temp_filename )
my_zip.write( temp_filename, "action.cb" )
            if self.char_model is not None:
self.char_model.save_model( temp_filename )
my_zip.write( temp_filename, "char.cb" )
os.unlink( temp_filename )
return self
def load( self, model='my_model.tm' ):
"""
Loads the model previously saved from the file system.
Keyword arguments:
model -- The filename of the model to load. (default my_model.tm)
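        Example (a minimal sketch; the file name is illustrative):
            tm = Transmorgrifier().load( "spelling_fix.tm" )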
"""
self.name = model
with zipfile.ZipFile( model, mode='r' ) as zip:
with zip.open( 'params.json' ) as f_in:
params = json.loads( f_in.read().decode() )
if params['version'] > FILE_VERSION: raise Exception( f"Version {params['version']} greater than {FILE_VERSION}" )
self.leading_context = int(params['leading_context'])
self.trailing_context = int(params['trailing_context'])
self.iterations = int(params['iterations'])
temp_filename = _mktemp()
with zip.open( 'action.cb' ) as f_in:
with open( temp_filename, "wb" ) as f_out:
f_out.write( f_in.read() )
self.action_model = CatBoostClassifier().load_model( temp_filename )
if 'constant_output' not in params:
with zip.open( 'char.cb' ) as f_in:
with open( temp_filename, "wb" ) as f_out:
f_out.write( f_in.read() )
self.char_model = CatBoostClassifier().load_model( temp_filename )
self.constant_output = None
else:
self.constant_output = params['constant_output']
self.char_model = None
os.unlink( temp_filename)
return self
def execute( self, from_sentences, verbose=False ):
"""
        Runs the model over from_sentences. The results are returned
        using yield, so wrap the call in list() if you want to index it.
        from_sentences can be a list or a generator.
        Keyword arguments:
        from_sentences -- Something iterable which yields strings.
        verbose -- Prints progress every ten sentences. (default False)
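        Example (a minimal sketch; the input string is illustrative):
            fixed_sentences = list( tm.execute( ["teh cat sat"] ) )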
"""
for i,from_sentence in enumerate(from_sentences):
yield _do_reconstruct(
action_model=self.action_model,
char_model=self.char_model,
constant_output=self.constant_output,
text=from_sentence,
num_pre_context_chars=self.leading_context,
num_post_context_chars=self.trailing_context )
            if verbose and i % 10 == 0:
                #from_sentences may be a generator without a len(), so fall back to just the index.
                total = len(from_sentences) if hasattr(from_sentences, "__len__") else "?"
                print( f"{i} of {total}" )
def demo( self, share=False ):
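        """
        Starts a gradio demo which runs the loaded model on text typed into a textbox.
        Keyword arguments:
        share -- If True, gradio also creates a temporary public URL. (default False)
        """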
import gradio as gr
def gradio_function( text ):
return list(self.execute( [text] ))[0]
with gr.Blocks() as demo:
name = gr.Markdown( self.name )
inp = gr.Textbox( label="Input" )
out = gr.Textbox( label="Output" )
inp.change( gradio_function, inputs=[inp], outputs=[out] )
demo.launch( share=share )
def _list_trace( trace ):
if trace.parent is None:
result = [trace]
else:
result = _list_trace( trace.parent )
result.append( trace )
return result
class _edit_trace_hop():
parent = None
edit_distance = None
char = None
from_row_i = None
to_column_i = None
action = None
def __str__( self ):
if self.action == START:
return "<start>"
elif self.action == INSERT_TO:
return f"<ins> {self.char}"
elif self.action == DELETE_FROM:
return f"<del> {self.char}"
elif self.action == MATCH:
return f"<match> {self.char}"
return "eh?"
def __repr__( self ):
return self.__str__()
def _trace_edits( from_sentence, to_sentence, print_debug=False ):
    #iterating over from_sentence gives the rows down the left side.
    #iterating over to_sentence gives the columns across the top.
#we will keep one row as we work on the next.
last_row = None
current_row = []
    #each index runs one past the index in the string, with index 0 standing for
    #the empty prefix; this gives the root cases across the top and down the left
    #of the match matrix.
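    #For example, tracing "cta" -> "cat" can come out as:
    #  <match> c, <ins> a, <match> t, <del> a
    #(the exact path may vary when several paths tie on edit distance).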
for from_row_i in range( len(from_sentence)+1 ):
for to_column_i in range( len(to_sentence )+1 ):
best_option = None
#root case.
if from_row_i == 0 and to_column_i == 0:
best_option = _edit_trace_hop()
best_option.parent = None
best_option.edit_distance = 0
best_option.char = ""
best_option.from_row_i = from_row_i
best_option.to_column_i = to_column_i
best_option.action = START
#check left
if to_column_i > 0:
if best_option is None or current_row[to_column_i-1].edit_distance + 1 < best_option.edit_distance:
best_option = _edit_trace_hop()
best_option.parent = current_row[to_column_i-1]
best_option.edit_distance = best_option.parent.edit_distance + 1
best_option.char = to_sentence[to_column_i-1]
best_option.from_row_i = from_row_i
best_option.to_column_i = to_column_i
best_option.action = INSERT_TO
#check up
if from_row_i > 0:
if best_option is None or last_row[to_column_i].edit_distance + 1 < best_option.edit_distance:
best_option = _edit_trace_hop()
best_option.parent = last_row[to_column_i]
best_option.edit_distance = best_option.parent.edit_distance + 1
best_option.char = from_sentence[from_row_i-1]
best_option.from_row_i = from_row_i
best_option.to_column_i = to_column_i
best_option.action = DELETE_FROM
#check match
            if to_column_i > 0 and from_row_i > 0:
if to_sentence[to_column_i-1] == from_sentence[from_row_i-1]:
                    if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer a match, so use <= rather than <
best_option = _edit_trace_hop()
best_option.parent = last_row[to_column_i-1]
best_option.edit_distance = best_option.parent.edit_distance + 1
best_option.char = from_sentence[from_row_i-1]
best_option.from_row_i = from_row_i
best_option.to_column_i = to_column_i
best_option.action = MATCH
if best_option is None: raise Exception( "Shouldn't end up with best_option being None" )
current_row.append(best_option)
last_row = current_row
current_row = []
if print_debug:
def print_diffs( current_node ):
if current_node.parent is not None:
print_diffs( current_node.parent )
if current_node.action == START:
print( "start" )
elif current_node.action == MATCH:
print( f"match {current_node.char}" )
elif current_node.action == INSERT_TO:
print( f"insert {current_node.char}" )
elif current_node.action == DELETE_FROM:
print( f"del {current_node.char}" )
print_diffs( last_row[-1] )
return last_row[-1]
def _parse_single_for_training( from_sentence, to_sentence, num_pre_context_chars, num_post_context_chars ):
trace = _trace_edits( from_sentence, to_sentence )
#we will collect a snapshot at each step.
trace_list = _list_trace(trace)
training_collection = []
#execute these things on the from_sentence and see if we get the to_sentence.
working_from = from_sentence
working_to = ""
used_from = ""
continuous_added = 0
continuous_dropped = 0
for thing in trace_list:
#gather action and context for training
if thing.action != START:
from_context = (working_from + (" " * num_post_context_chars))[:num_post_context_chars]
to_context = ((" " * num_pre_context_chars) + working_to )[-num_pre_context_chars:]
used_context = ((" " * num_pre_context_chars) + used_from )[-num_pre_context_chars:]
training_collection.append({
"from_context": from_context,
"to_context": to_context,
"used_context": used_context,
"action": thing.action,
"continuous_added": continuous_added,
"continuous_dropped": continuous_dropped,
"char": thing.char if thing.action == INSERT_TO else ' ',
})
#now execute the action for the next step.
if thing.action == START:
pass
elif thing.action == INSERT_TO:
working_to += thing.char
continuous_added += 1
continuous_dropped = 0
elif thing.action == DELETE_FROM:
used_from += working_from[0]
working_from = working_from[1:]
continuous_added = 0
continuous_dropped += 1
elif thing.action == MATCH:
used_from += working_from[0]
working_to += working_from[0]
working_from = working_from[1:]
continuous_added = 0
continuous_dropped = 0
if to_sentence != working_to:
print( "Replay failure" )
    #So now I have training_collection, which is a list of dictionaries where each dictionary is an action with its context.
    #I need to change it into a dictionary of lists where each key is a column and each list holds the rows.
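    #The resulting frame has one column per context character (f0.. for the upcoming
    #from text, t0.. for output text already emitted, u0.. for from text already
    #consumed) plus the continuous_added and continuous_dropped counters.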
context_split_into_dict = {}
#first collect the from_context:
for i in range( num_post_context_chars ):
this_slice = []
for training in training_collection:
this_slice.append( training['from_context'][i] )
context_split_into_dict[ f"f{i}" ] = this_slice
#now collect to_context:
for i in range( num_pre_context_chars ):
this_slice = []
for training in training_collection:
this_slice.append( training['to_context'][i] )
context_split_into_dict[ f"t{i}" ] = this_slice
#now collect used_context
for i in range( num_pre_context_chars ):
this_slice = []
for training in training_collection:
this_slice.append( training['used_context'][i] )
context_split_into_dict[ f"u{i}" ] = this_slice
#now these two things.
context_split_into_dict["continuous_added"] = []
context_split_into_dict["continuous_dropped"] = []
for training in training_collection:
context_split_into_dict["continuous_added"].append( training["continuous_added"] )
context_split_into_dict["continuous_dropped"].append( training["continuous_dropped"] )
#now also collect the output answers.
result_split_into_dict = {}
action_slice = []
char_slice = []
for training in training_collection:
action_slice.append( training['action'] )
char_slice.append( training['char'] )
result_split_into_dict['action'] = action_slice
result_split_into_dict['char'] = char_slice
#now return it as a data_frame.
return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
def _parse_for_training( from_sentences, to_sentences, num_pre_context_chars, num_post_context_chars ):
out_observations_list = []
out_results_list = []
for index, (from_sentence, to_sentence) in enumerate(zip( from_sentences, to_sentences )):
if type(from_sentence) != float and type(to_sentence) != float: #bad lines are nan which are floats.
specific_observation, specific_result = _parse_single_for_training( from_sentence, to_sentence, num_pre_context_chars=num_pre_context_chars, num_post_context_chars=num_post_context_chars )
out_observations_list.append( specific_observation )
out_results_list.append( specific_result )
if index % 100 == 0:
print( f"parsing {index} of {len(from_sentences)}")
return pd.concat( out_observations_list ), pd.concat( out_results_list )
def _train_catboost( X, y, iterations, device, verbose, model_piece, learning_rate = .07 ):
X = X.fillna( ' ' )
passed = False
while not passed:
train_pool = Pool(
data=X,
label=y,
            cat_features=[i for i,x in enumerate(X.keys()) if x[0] in ['f','t','u']] #watch out: if another field is added, its name must not start with one of these letters.
)
        validation_pool = None #Can't use a validation pool because it may contain chars not seen in training.
model = CatBoostClassifier(
iterations = iterations,
learning_rate = learning_rate,
task_type="GPU" if device.lower() != 'cpu' else "CPU",
devices=device if device.lower() != 'cpu' else None
)
        model.fit( train_pool, eval_set=validation_pool, verbose=verbose )
passed = True
    if verbose: print( '{} is fitted: {}'.format(model_piece,model.is_fitted()))
    if verbose: print( '{} params:\n{}'.format(model_piece,model.get_params()))
return model
def _mktemp():
    #mktemp exists in the standard library, but it has been deprecated in favor of
    #mkstemp. However, catboost can't write to a file handle yet, so I need an actual
    #filename.
number = 0
while os.path.exists( f".temp_{number}~" ):
number += 1
return f".temp_{number}~"
def predict_wrapper( model, model_input ):
    #Big hack. Catboost has shown itself to be inconsistent about producing
    #either a single value or an array with a single value in it.
    #I traced it back to the saved model, and then the model to what
    #data it is trained on, but I couldn't figure out what it was
    #in the data that made the saved model behave one way or the other,
    #so I unwrap the result here so that it works either way.
result = model.predict( model_input )[0]
try:
result = result[0]
    except Exception:
        pass
return result
def _do_reconstruct( action_model, char_model, constant_output, text, num_pre_context_chars, num_post_context_chars ):
#test for nan.
if text != text: text = ''
working_from = text
working_to = ""
used_from = ""
continuous_added = 0
continuous_dropped = 0
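        #Keep going while input remains, but bail out if the output grows to three times
        #the input or the model gets stuck emitting the same character five times in a row.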
while working_from and len(working_to) < 3*len(text) and (len(working_to) < 5 or working_to[-5:] != (working_to[-1] * 5)):
from_context = (working_from + (" " * num_post_context_chars))[:num_post_context_chars]
to_context = ((" " * num_pre_context_chars) + working_to )[-num_pre_context_chars:]
used_context = ((" " * num_pre_context_chars) + used_from )[-num_pre_context_chars:]
#construct the context.
context_as_dictionary = {}
#from_context
for i in range( num_post_context_chars ):
context_as_dictionary[ f"f{i}" ] = [from_context[i]]
#to_context
for i in range( num_pre_context_chars ):
context_as_dictionary[ f"t{i}" ] = [to_context[i]]
#used_context
for i in range( num_pre_context_chars ):
context_as_dictionary[ f"u{i}" ] = [used_context[i]]
#these two things.
context_as_dictionary["continuous_added"] = [continuous_added]
context_as_dictionary["continuous_dropped"] = [continuous_dropped]
        #make it a pandas DataFrame.
context_as_pd = pd.DataFrame( context_as_dictionary )
#run the model
action_model_result = predict_wrapper(action_model,context_as_pd )
        #stop runaway generation. If we have added more chars than our context, nothing is going to change.
if action_model_result == INSERT_TO and continuous_added >= num_post_context_chars:
            #I could set this to MATCH or DELETE_FROM, but it is already a wreck, let's just see what happens with this.
action_model_result = MATCH
if action_model_result == START:
pass
elif action_model_result == INSERT_TO:
if constant_output is None:
#for an insert ask the char model what to insert
char_model_result = predict_wrapper(char_model, context_as_pd )
else:
char_model_result = constant_output
working_to += char_model_result
continuous_added += 1
continuous_dropped = 0
elif action_model_result == DELETE_FROM:
used_from += working_from[0]
working_from = working_from[1:]
continuous_added = 0
continuous_dropped += 1
elif action_model_result == MATCH:
used_from += working_from[0]
working_to += working_from[0]
working_from = working_from[1:]
continuous_added = 0
continuous_dropped = 0
return working_to
#edit distance from https://stackoverflow.com/a/32558749/1419054
def _levenshteinDistance(s1, s2):
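    #nan values (empty csv cells) are not equal to themselves, so treat them as empty strings.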
if s1 != s1: s1 = ''
if s2 != s2: s2 = ''
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
def train( in_csv, a_header, b_header, model, iterations, device, leading_context, trailing_context, train_percentage, verbose ):
if verbose: print( "loading csv" )
full_data = pd.read_csv( in_csv )
split_index = int( train_percentage/100*len(full_data) )
train_data = full_data.iloc[:split_index,:].reset_index(drop=True)
if verbose: print( "parsing data for training" )
tm = Transmorgrifier()
tm.train( from_sentences=train_data[a_header],
to_sentences=train_data[b_header],
iterations = iterations,
device = device,
leading_context = leading_context,
trailing_context = trailing_context,
verbose=verbose,
)
tm.save( model )
def execute( include_stats, in_csv, out_csv, a_header, b_header, model, execute_percentage, verbose ):
if verbose: print( "loading csv" )
full_data = pd.read_csv( in_csv )
split_index = int( (100-execute_percentage)/100*len(full_data) )
execute_data = full_data.iloc[split_index:,:].reset_index(drop=True)
tm = Transmorgrifier()
tm.load( model )
results = list(tm.execute( execute_data[a_header ], verbose=verbose ))
if include_stats:
before_edit_distances = []
after_edit_distances = []
percent_improvement = []
for row in range(len( execute_data )):
before_edit_distances.append(
_levenshteinDistance( execute_data[a_header][row], execute_data[b_header][row] )
)
after_edit_distances.append(
_levenshteinDistance( results[row], execute_data[b_header][row] )
)
percent_improvement.append(
100*(before_edit_distances[row] - after_edit_distances[row])/max(1,before_edit_distances[row])
)
pd_results = pd.DataFrame( {
"in_data": execute_data[a_header],
"out_data": execute_data[b_header],
"generated_data": results,
"before_edit_distance": before_edit_distances,
"after_edit_distance": after_edit_distances,
"percent_improvement": percent_improvement,
})
pd_results.to_csv( out_csv )
else:
        #Without stats, just write out the generated conversion.
        pd_results = pd.DataFrame( {
            "generated_data": results,
        })
        pd_results.to_csv( out_csv )
def safe_float( value ):
    if value is not None:
        return float(value)
    return None #explicit None return.
def main():
parser = argparse.ArgumentParser(
prog = 'transmorgrify.py',
        description = 'Converts text from one form to another according to a trained model.',
epilog = '(C) Joshua Lansford')
parser.add_argument('-t', '--train', action='store_true', help='Train a model instead of executing a model')
parser.add_argument('-e', '--execute', action='store_true', help='Use an existing trained model.')
parser.add_argument('-g', '--gradio', action='store_true', help='Start a gradio demo with the selected model.' )
parser.add_argument('-s', '--share', action='store_true', help="Share the gradio app with a temporary public URL." )
parser.add_argument('-i', '--in_csv', help='The csv to read training or input data from', default='in.csv' )
parser.add_argument('-o', '--out_csv', help='The csv to write conversion to', default='out.csv' )
parser.add_argument('-a', '--a_header', help='The column header for training or transforming from', default="source" )
parser.add_argument('-b', '--b_header', help='The column header for training the transformation to', default="target" )
parser.add_argument('-m', '--model',help='The model file to create during training or use during transformation', default='model.tm' )
parser.add_argument('-n', '--iterations', help='The number of iterations to train', default=2000 )
    parser.add_argument('-d', '--device', help='Which device to pass to catboost if using a GPU, else "cpu"', default='cpu' )
parser.add_argument('-x', '--context', help='The number of leading and trailing chars to use as context', default=7 )
parser.add_argument('-p', '--train_percentage', help="The percentage of data to train on, leaving the rest for testing.")
    parser.add_argument('-v', '--verbose', action='store_true', help='Talks a lot?' )
parser.add_argument('-c', '--include_stats', action='store_true', help='Use b_header to compute stats and add to output csv.')
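    #Example invocation (a sketch; the csv, column, and model names are illustrative):
    #  python3 transmorgrify.py --train --execute -i data.csv -a source -b target -m model.tm -p 80 -c -v
    #This trains on the first 80% of data.csv, converts the remaining 20%, and writes stats to out.csv.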
args = parser.parse_args()
if not args.train and not args.execute and not args.gradio: print( "Must include --execute, --train and/or --gradio to do something." )
if args.train:
train_percentage = safe_float(args.train_percentage)
if train_percentage is None:
if args.execute:
train_percentage = 50
else:
train_percentage = 100
train( in_csv=args.in_csv,
a_header=args.a_header,
b_header=args.b_header,
model=args.model,
iterations=int(args.iterations),
device=args.device,
leading_context=int(args.context),
trailing_context=int(args.context),
train_percentage=train_percentage,
verbose=args.verbose,
)
if args.execute:
if args.train_percentage is None:
if args.train:
execute_percentage = 50
else:
execute_percentage = 100
else:
execute_percentage = 100-safe_float(args.train_percentage)
execute(
include_stats=args.include_stats,
in_csv=args.in_csv,
out_csv=args.out_csv,
a_header=args.a_header,
b_header=args.b_header,
model=args.model,
execute_percentage=execute_percentage,
verbose=args.verbose,
)
if args.gradio:
tm = Transmorgrifier()
tm.load( args.model )
tm.demo( share=args.share )
if __name__ == '__main__':
main()