Character-Level Text Noise-Adding System
A Character-Level Text Noise-Adding System is a text noise-adding system that implements a character-level noise-adding algorithm (which performs random character-level edits) to solve a character-level text noise-adding task.
- Context:
- It can be used to evaluate a Text Error Correction System (such as a character-level TEC system).
- Example(s):
- a Python-based Character-Level Noise-Adding System, such as the one in keras_spell.py.
- …
- a Python-based Character-Level Noise-Adding System, such as: the one in
- Counter-Example(s):
- See: Text String.
References
2018
# CLTnoiseadder: A character-level text noise adding system
# TODO:
# * create a module version
# * support for biased selection based on a character frequency in a corpus.
import sys
import string
import random
import numpy.random
# Probability that a given character position receives a noise edit.
noise_level = 0.08
# Given a noise edit (and a following character to swap with), probability that
# the edit is a swap of two adjacent characters.
swap_rate = 0.33
# Given a noise edit that was not a swap, probability that the edit is a
# deletion; otherwise a random character is added.
delete_rate = 0.33
# Pool of characters from which added noise characters are drawn.
CHARS=string.printable
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``s1`` into ``s2``.

    Args:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        The edit distance as a non-negative integer.
    """
    # Keep s1 as the longer sequence so every row has len(s2) + 1 cells.
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    # Row for the empty prefix of s1: turning "" into s2[:j] costs j insertions.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        # First cell: turning s1[:i + 1] into "" costs i + 1 deletions.
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            # j + 1 instead of j since previous_row and current_row are one
            # cell longer than s2.
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # The boolean adds 1 only when the two elements differ.
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]
# Create the frequency of characters (for now uniform).
# TODO: base the distribution on some corpus
# One probability per entry of CHARS, in the same order as CHARS, so the list
# can be passed as the `p` argument when sampling a character from CHARS.
character_distribution = [1.0 / len(CHARS)] * len(CHARS)
# Source page: https://www.gabormelli.com/RKB/Character-Level_Text_Noise-Adding_System
# For every line read from stdin, print a noisy copy of the line followed by
# the Levenshtein distance between the original line and the noisy copy.

# Sampling pool for added characters; built once instead of once per edit.
candidate_characters = list(CHARS)

noisy_line = ""
for original_line in sys.stdin:
    # Drop the line terminator so it is neither corrupted by the noise nor
    # duplicated by print(), and so it does not count towards the distance.
    original_line = original_line.rstrip("\n")
    characters = list(original_line)

    # Start from an empty buffer for every line; otherwise the noise of all
    # previous lines would leak into this line's output and distance.
    noisy_characters = []
    i = 0
    while i < len(characters):
        if random.random() < noise_level:
            # swap: emit the next character before the current one
            if i < len(characters) - 1 and random.random() <= swap_rate:
                noisy_characters.append(characters[i + 1])
                noisy_characters.append(characters[i])
                # Both characters were consumed; advancing by one would emit
                # characters[i + 1] a second time on the next iteration.
                i += 2
            # delete: skip the current character without emitting it
            elif random.random() <= delete_rate:
                i += 1
            # add: insert a random character before the current one; the
            # current character is handled by the next iteration.
            else:
                noisy_character = numpy.random.choice(
                    candidate_characters, 1, p=character_distribution)[0]
                noisy_characters.append(noisy_character)
        else:
            # No noise at this position: copy the character unchanged.
            noisy_characters.append(characters[i])
            i += 1

    noisy_line = "".join(noisy_characters)

    # print it
    print(noisy_line)
    # measure the difference
    print(levenshtein(original_line, noisy_line))
2017
def add_noise_to_string(a_string, amount_of_noise):
    """Add some artificial spelling mistakes to the string.

    Up to four edits are applied in order: replace one character, delete one
    character, insert one character, and transpose two adjacent characters.
    Each edit is applied only when a fresh ``rand()`` draw is below
    ``amount_of_noise * len(a_string)``, using the string's current length.

    NOTE(review): ``rand``, ``random_randint``, ``random_choice``, ``CHARS``
    and ``CONFIG`` are not defined in this snippet; presumably the first three
    are aliases of ``numpy.random`` functions - confirm against the
    originating module.

    Args:
        a_string: The text to corrupt.
        amount_of_noise: Per-character scale factor for the chance of each
            kind of edit.

    Returns:
        The string with zero or more of the edits applied.
    """
    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position]
                    + random_choice(CHARS[:-1])
                    + a_string[random_char_position + 1:])
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position]
                    + a_string[random_char_position + 1:])
    if (len(a_string) < CONFIG.max_input_len
            and rand() < amount_of_noise * len(a_string)):
        # Add a random character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position]
                    + random_choice(CHARS[:-1])
                    + a_string[random_char_position:])
    # The length check comes after the rand() draw so the sequence of random
    # draws is unchanged. A transposition needs two characters; without the
    # check a one-character string would reach random_randint(0).
    if rand() < amount_of_noise * len(a_string) and len(a_string) > 1:
        # Transpose 2 characters
        random_char_position = random_randint(len(a_string) - 1)
        a_string = (a_string[:random_char_position]
                    + a_string[random_char_position + 1]
                    + a_string[random_char_position]
                    + a_string[random_char_position + 2:])
    return a_string