Character-Level Text Noise-Adding System

From GM-RKB
Jump to navigation Jump to search

A Character-Level Text Noise-Adding System is a text noise-adding system that implements a character-level noise-adding algorithm (which performs random character-level edits) to solve a character-level text noise-adding task.



References

2018

# CLTnoiseadder: A character-level text noise adding system
# Source: https://www.gabormelli.com/RKB/Character-Level_Text_Noise-Adding_System
#
# Reads lines from standard input. For every input line it prints a noisy
# copy of the line, followed by the Levenshtein distance between the
# original line and its noisy copy.
#
# TODO:
# * create a module version
# * support for biased selection based on a character frequency in a corpus.

import sys
import string
import random
import numpy.random

# Probability that a given character position is corrupted at all.
noise_level = 0.08
# Given a corrupted position (that is not the last one): chance of swapping
# the character with its right-hand neighbour.
swap_rate = 0.33
# Given a corrupted position that was not swapped: chance of deleting the
# character. Otherwise a random character is inserted.
delete_rate = 0.33

# Alphabet the inserted characters are drawn from. Note that
# string.printable also contains whitespace such as "\n" and "\t".
CHARS = string.printable


def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between s1 and s2.

    Classic two-row dynamic programme; the longer string is always iterated
    in the outer loop so each row has len(shorter) + 1 entries.
    """
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            # j + 1 instead of j since previous_row and current_row are one
            # character longer than s2
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]


# Create the frequency of characters (for now uniform).
# TODO: base the distribution on some corpus
character_distribution = [1.0 / len(CHARS)] * len(CHARS)

# numpy.random.choice wants a sequence of candidates; build it once.
_CHAR_CANDIDATES = list(CHARS)


def add_noise(line, noise=noise_level, swap=swap_rate, delete=delete_rate):
    """Return a copy of `line` with random character-level edits applied.

    Each position is corrupted with probability `noise`. A corrupted
    position is, in order of precedence:
      * swapped with the next character (probability `swap`, never for the
        last character),
      * deleted (probability `delete`),
      * otherwise preceded by one random character drawn from CHARS
        according to `character_distribution`.

    Every branch advances past exactly the characters it consumed, so no
    character is duplicated or silently skipped.
    """
    characters = list(line)
    pieces = []
    i = 0
    while i < len(characters):
        if random.random() < noise:
            if i < len(characters) - 1 and random.random() <= swap:
                # swap: emit the pair reversed and consume both characters
                pieces.append(characters[i + 1])
                pieces.append(characters[i])
                i += 2
            elif random.random() <= delete:
                # delete: consume the character without emitting it
                i += 1
            else:
                # add: emit one random character, then keep the original
                noisy_character = numpy.random.choice(
                    _CHAR_CANDIDATES, 1, p=character_distribution)[0]
                # str() turns numpy's string scalar into a plain str
                pieces.append(str(noisy_character))
                pieces.append(characters[i])
                i += 1
        else:
            pieces.append(characters[i])
            i += 1
    return "".join(pieces)


def main():
    """Add noise to every line on stdin; print the result and its distance."""
    for original_line in sys.stdin:
        # Drop the line terminator so it is neither corrupted nor printed
        # twice (print() adds its own newline).
        original_line = original_line.rstrip("\n")

        # Create a noisy line (a fresh one per input line)
        noisy_line = add_noise(original_line)

        # print it
        print(noisy_line)

        # measure the difference
        print(levenshtein(original_line, noisy_line))


if __name__ == "__main__":
    main()

2017

def add_noise_to_string(a_string, amount_of_noise):
    """Add some artificial spelling mistakes to the string.

    Up to four edits are attempted in a fixed order: substitution, deletion,
    insertion and transposition. Each edit is applied when ``rand()`` falls
    below ``amount_of_noise * len(a_string)``; the length is re-read before
    every step, so earlier edits influence later thresholds.

    Args:
        a_string: The text to corrupt.
        amount_of_noise: Scale factor; multiplied by the current string
            length to form the trigger threshold of each edit.

    Returns:
        The (possibly modified) string.

    NOTE(review): ``rand``, ``random_randint``, ``random_choice``, ``CHARS``
    and ``CONFIG`` are defined outside this block. The code assumes
    ``random_randint(n)`` returns an index in ``[0, n)`` as
    ``numpy.random.randint`` does -- confirm against the file's imports.
    """
    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character.
        # CHARS[:-1] leaves out the last entry of CHARS (presumably a
        # reserved/padding symbol -- TODO confirm).
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position] + random_choice(CHARS[:-1]) +
                    a_string[random_char_position + 1:])
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        random_char_position = random_randint(len(a_string))
        a_string = a_string[:random_char_position] + a_string[random_char_position + 1:]
    if len(a_string) < CONFIG.max_input_len and rand() < amount_of_noise * len(a_string):
        # Add a random character (only while below the configured max length)
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position] + random_choice(CHARS[:-1]) +
                    a_string[random_char_position:])
    # The length guard comes second so rand() is still drawn exactly as
    # before; it prevents random_randint(0) on a one-character string, which
    # has no pair of characters to transpose.
    if rand() < amount_of_noise * len(a_string) and len(a_string) > 1:
        # Transpose 2 characters
        random_char_position = random_randint(len(a_string) - 1)
        a_string = (a_string[:random_char_position] +
                    a_string[random_char_position + 1] +
                    a_string[random_char_position] +
                    a_string[random_char_position + 2:])
    return a_string