|
import unittest |
|
|
|
from onmt.transforms.bart import word_start_finder |
|
from onmt.utils.alignment import subword_map_by_joiner, subword_map_by_spacer |
|
from onmt.constants import SubwordMarker |
|
|
|
|
|
class TestWordStartFinder(unittest.TestCase):
    """Check the per-token word-start flags returned by ``word_start_finder``."""

    def test_word_start_naive(self):
        """``ignore_subword=True``: every token is expected to be a word start."""
        tokens = [
            "however", ",", "according", "to", "the", "logs", ",",
            "she", "is", "hard", "-", "working", ".",
        ]
        # One True per input token (13 of them); built before the call.
        expected = [True] * len(tokens)
        finder = word_start_finder(ignore_subword=True)
        self.assertEqual(finder(tokens), expected)

    def test_word_start_joiner(self):
        """``is_joiner=True``: tokens glued by a joiner are not word starts."""
        tokens = [
            "however", "■,", "according", "to", "the", "logs", "■,",
            "she", "is", "hard", "■-■", "working", "■.",
        ]
        expected = [
            True, False, True, True, True, True, False,
            True, True, True, False, False, False,
        ]
        finder = word_start_finder(is_joiner=True)
        self.assertEqual(finder(tokens), expected)

    def test_word_start_spacer(self):
        """Default finder: same flags with or without a spacer on the first token."""
        finder = word_start_finder()
        expected = [
            True, False, True, True, True, True, False,
            True, True, True, False, False, False,
        ]

        # First token carries a leading spacer.
        with_leading_spacer = [
            "▁however", ",", "▁according", "▁to", "▁the", "▁logs", ",",
            "▁she", "▁is", "▁hard", "-", "working", ".",
        ]
        self.assertEqual(finder(with_leading_spacer), expected)

        # Same sentence, but the first token has no spacer.
        without_leading_spacer = [
            "however", ",", "▁according", "▁to", "▁the", "▁logs", ",",
            "▁she", "▁is", "▁hard", "-", "working", ".",
        ]
        self.assertEqual(finder(without_leading_spacer), expected)
|
|
|
|
|
class TestSubwordGroup(unittest.TestCase):
    """Check the word-group id assigned to each subword token.

    Each test feeds a token list to ``subword_map_by_joiner`` or
    ``subword_map_by_spacer`` and compares the returned list (one integer
    per input token) with the expected grouping.
    """

    def test_subword_group_joiner(self):
        """Joiner-marked tokens share the group id of their neighbour."""
        tokens = [
            "however", "■,", "according", "to", "the", "logs", "■,",
            "she", "is", "hard", "■-■", "working", "■.",
        ]
        expected = [0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 7, 7, 7]
        groups = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
        self.assertEqual(groups, expected)

    def test_subword_group_joiner_with_case_markup(self):
        """Case-markup placeholders are expected not to open a group of their own."""
        tokens = [
            "⦅mrk_case_modifier_C⦆", "however", "■,", "according", "to", "the",
            "logs", "■,", "⦅mrk_begin_case_region_U⦆", "she", "is", "hard",
            "■-■", "working", "⦅mrk_end_case_region_U⦆", "■.",
        ]
        expected = [0, 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7]
        groups = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
        self.assertEqual(groups, expected)

    def test_subword_group_joiner_with_case_markup_advanced(self):
        """Case markup interleaved with joiner-split pieces inside one word."""
        tokens = [
            "⦅mrk_case_modifier_C⦆", "dummy", "text", "⦅mrk_case_modifier_C⦆",
            "1■", "h■", "k", "⦅mrk_begin_case_region_U⦆", "th■",
            "⦅mrk_end_case_region_U⦆", "n", "more", "dummy", "text",
        ]
        expected = [0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 5, 6]
        groups = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
        self.assertEqual(groups, expected)

    def test_subword_group_joiner_prior_tokenization(self):
        """Group ids follow ``original_subwords`` when it is supplied.

        The expected ids run from 0 to 10, one per entry of the 11-token
        ``original`` list, rather than following joiner adjacency alone.
        """
        tokens = [
            "⦅mrk_case_modifier_C⦆", "how■", "ever", "■,", "according", "to",
            "the", "logs", "■,", "⦅mrk_begin_case_region_U⦆", "she", "is",
            "hard", "■-■", "working", "⦅mrk_end_case_region_U⦆", "■.",
        ]
        original = [
            "However", "■,", "according", "to", "the", "logs", "■,",
            "SHE", "IS", "HARD-WORKING", "■.",
        ]
        expected = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9, 9, 9, 10]
        groups = subword_map_by_joiner(
            tokens, marker=SubwordMarker.JOINER, original_subwords=original
        )
        self.assertEqual(groups, expected)

    def test_subword_group_joiner_prior_tokenization_harder(self):
        """When ``original_subwords`` equals the input, each token is its own group."""
        tokens = [
            "⦅mrk_case_modifier_C⦆", "how■", "ever", "■,", "according", "to",
            "the", "logs", "■,", "⦅mrk_begin_case_region_U⦆", "she", "is",
            "hard", "■-■", "working", "⦅mrk_end_case_region_U⦆", "■.",
        ]
        # Equal content but a distinct list object, so the function under test
        # never sees the same list passed as both arguments.
        original = list(tokens)
        expected = list(range(17))  # 0, 1, ..., 16: one id per token
        groups = subword_map_by_joiner(
            tokens, marker=SubwordMarker.JOINER, original_subwords=original
        )
        self.assertEqual(groups, expected)

    def test_subword_group_joiner_with_new_joiner(self):
        """Joiners emitted as standalone ``"■"`` tokens still glue their neighbours."""
        tokens = [
            "⦅mrk_case_modifier_C⦆", "however", "■", ",", "according", "to",
            "the", "logs", "■", ",", "⦅mrk_begin_case_region_U⦆", "she", "is",
            "hard", "■", "-", "■", "working", "⦅mrk_end_case_region_U⦆", "■", ".",
        ]
        expected = [
            0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 5,
            5, 6, 7, 7, 7, 7, 7, 7, 7, 7,
        ]
        groups = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
        self.assertEqual(groups, expected)

    def test_subword_group_naive(self):
        """Without any joiner marker every token gets its own group id."""
        tokens = [
            "however", ",", "according", "to", "the", "logs", ",",
            "she", "is", "hard", "-", "working", ".",
        ]
        expected = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        groups = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
        self.assertEqual(groups, expected)

    def test_subword_group_spacer(self):
        """Spacer grouping is the same with or without a spacer on the first token."""
        expected = [0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 7, 7, 7]

        # First token carries a leading spacer. Previously this input was a
        # verbatim copy of the spacer-less one below, so this case was untested.
        with_leading_spacer = [
            "▁however", ",", "▁according", "▁to", "▁the", "▁logs", ",",
            "▁she", "▁is", "▁hard", "-", "working", ".",
        ]
        groups = subword_map_by_spacer(
            with_leading_spacer, marker=SubwordMarker.SPACER
        )
        self.assertEqual(groups, expected)

        # Same sentence, but the first token has no spacer.
        without_leading_spacer = [
            "however", ",", "▁according", "▁to", "▁the", "▁logs", ",",
            "▁she", "▁is", "▁hard", "-", "working", ".",
        ]
        groups = subword_map_by_spacer(
            without_leading_spacer, marker=SubwordMarker.SPACER
        )
        self.assertEqual(groups, expected)

    def test_subword_group_spacer_with_case_markup(self):
        """Case-markup placeholders, with or without a spacer, open no group."""
        tokens = [
            "⦅mrk_case_modifier_C⦆", "▁however", ",", "▁according", "▁to",
            "▁the", "▁logs", ",", "▁⦅mrk_begin_case_region_U⦆", "▁she", "▁is",
            "▁hard", "-", "working", ".", "▁⦅mrk_end_case_region_U⦆",
        ]
        expected = [0, 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7]
        groups = subword_map_by_spacer(tokens, marker=SubwordMarker.SPACER)
        self.assertEqual(groups, expected)

    def test_subword_group_spacer_with_spacer_new(self):
        """Spacers emitted as standalone ``"▁"`` tokens still delimit words."""
        tokens = [
            "⦅mrk_case_modifier_C⦆", "▁", "however", ",", "▁", "according",
            "▁", "to", "▁", "the", "▁", "logs", ",", "▁",
            "⦅mrk_begin_case_region_U⦆", "▁", "she", "▁", "is", "▁", "hard",
            "-", "working", ".", "▁", "⦅mrk_end_case_region_U⦆",
        ]
        expected = [
            0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4,
            5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7,
        ]
        groups = subword_map_by_spacer(tokens, marker=SubwordMarker.SPACER)
        self.assertEqual(groups, expected)
|
|
|
|
|
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
|