From 9f309be709880281b2f30f19d637e6a866ad4b04 Mon Sep 17 00:00:00 2001 From: sergiotarxz Date: Tue, 5 Sep 2023 19:29:30 +0200 Subject: [PATCH] Adding a Stemmer instead of suffixes. --- Build.PL | 1 + lib/BurguillosInfo/IndexUtils.pm | 50 +++++++-------------------------- 2 files changed, 11 insertions(+), 40 deletions(-) diff --git a/Build.PL b/Build.PL index 1b7570a..5d0170a 100755 --- a/Build.PL +++ b/Build.PL @@ -25,6 +25,7 @@ my $build = Module::Build->new( 'Moo::Role' => 0, 'Module::Pluggable' => 0, 'List::AllUtils' => 0, + 'Lingua::Stem::Snowball' => 0, }, ); $build->create_build_script; diff --git a/lib/BurguillosInfo/IndexUtils.pm b/lib/BurguillosInfo/IndexUtils.pm index 828be48..a6c78ef 100644 --- a/lib/BurguillosInfo/IndexUtils.pm +++ b/lib/BurguillosInfo/IndexUtils.pm @@ -12,49 +12,19 @@ use Unicode::Normalize qw/NFKD/; use Moo; -sub normalize ( $self, $text ) { +use Lingua::Stem::Snowball; + +sub normalize($self, $text) { return undef if !defined $text; my $decomposed = NFKD($text); $decomposed =~ s/\p{NonspacingMark}//g; - $decomposed =~ s/es\b//g; - $decomposed =~ s/as\b//g; - $decomposed =~ s/os\b//g; - $decomposed =~ s/e\b//g; - $decomposed =~ s/o\b//g; - $decomposed =~ s/a\b//g; - $decomposed =~ s/i\b//g; - $decomposed =~ s/cion\b//g; - $decomposed =~ s/diccion\b//g; - $decomposed =~ s/duccion\b//g; - $decomposed =~ s/dur\b//g; - $decomposed =~ s/eccion\b//g; - $decomposed =~ s/epcion\b//g; - $decomposed =~ s/mient\b//g; - $decomposed =~ s/scripcion\b//g; - $decomposed =~ s/sicion\b//g; - $decomposed =~ s/sion\b//g; - $decomposed =~ s/dad\b//g; - $decomposed =~ s/tad\b//g; - $decomposed =~ s/bilidad\b//g; - $decomposed =~ s/edad\b//g; - $decomposed =~ s/idad\b//g; - $decomposed =~ s/ism\b//g; - $decomposed =~ s/ant\b//g; - $decomposed =~ s/ent\b//g; - $decomposed =~ s/dor\b//g; - $decomposed =~ s/der\b//g; - $decomposed =~ s/ist\b//g; - $decomposed =~ s/abl\b//g; - $decomposed =~ s/ant\b//g; - $decomposed =~ s/ent\b//g; - $decomposed =~ 
s/rgir\b//g; - $decomposed =~ s/ent\b//g; - $decomposed =~ s/errim\b//g; - $decomposed =~ s/ibl\b//g; - $decomposed =~ s/ific\b//g; - $decomposed =~ s/isim\b//g; - $decomposed =~ s/ecer\b//g; - $decomposed =~ s/ific\b//g; + my @words; + while ($decomposed =~ /\b(\w+)\b/g) { + push @words, $1; + } + my $stemmer = Lingua::Stem::Snowball->new( lang => 'es' ); + $stemmer->stem_in_place(\@words); + $decomposed = join " ", @words; return $decomposed; }