Better normalization hopefully.

This commit is contained in:
Sergiotarxz 2023-09-05 17:08:37 +02:00
parent 96b403ff87
commit 7c5c94b45a

View File

# normalize($self, $text)
#
# Reduces $text to a crude, accent-insensitive stem form:
#   1. Unicode NFKD decomposition, then removal of every nonspacing
#      (combining) mark, so accented letters become their base letters.
#   2. Removal of one suffix from the list below wherever it ends right
#      before a word boundary.
#   3. Any 'a' still sitting at a word end is rewritten to 'o'.
#
# $self is not used by the body.
# Returns undef when $text is undefined, otherwise the normalized string.
sub normalize($self, $text) {
    # Deliberately 'return undef' rather than a bare 'return': callers in
    # list context keep receiving exactly one element.
    return undef if !defined $text;

    my $decomposed = NFKD($text);
    $decomposed =~ s/\p{NonspacingMark}//g;

    # The suffixes are written WITHOUT accents on purpose: by this point the
    # text contains no combining marks, so an accented alternative could
    # never match. Under /x whitespace and newlines inside the pattern are
    # ignored, so every line must end with '|' or two neighbouring
    # alternatives silently fuse into one. Order does not matter: each
    # alternative has to end at the same word boundary, so at a given start
    # position at most one distinct string can match.
    $decomposed =~ s{(?:
        ada|ado|aje|cion|diccion|duccion|dura|eccion|epcion|ido|ion|miento|
        ncia|on|scripcion|sicion|sion|dad|tad|bilidad|edad|era|eria|ez|eza|
        ia|idad|ismo|ante|ente|ura|dor|dero|ero|ista|ario|able|aceo|aco|al|
        aneo|rgir|ento|errimo|ible|ico|ifico|il|ino|isimo|ivo|izo|oso|ear|
        ecer|ificar|izar|es|as|os|e|o|a
    )\b}{}xg;

    $decomposed =~ s/a\b/o/g;
    return $decomposed;
}