#!/usr/bin/env perl

# extract_carta_genuino.pl
#
# Walks the category links on the Mundo Genuino start page, follows every
# product link found on each category page and prints, for each product, its
# title, URL, the text of the first <strong> element (used as the ingredient
# list) and up to two prices.
#
# Output (UTF-8, on STDOUT):
#   <category title>
#   <category url>
#   (<product title>)(<product url>)(<ingredients>)(<price>)...
#
# Provenance: recovered from patch a1c19077567eced33ac1b9fa0989b00f80a19678
# by sergiotarxz, Fri, 5 May 2023 03:39:13 +0200:
# "Adding the script to extract data from the awful genuino's website."

use v5.36.0;

use strict;
use warnings;

use utf8;

use Mojo::UserAgent;

# Root of the site; relative hrefs found in the pages are appended to it.
my $BASE_URL = 'https://www.mundogenuino.eu';

# At most this many "<amount> €" matches are reported per product.
my $MAX_PRICES = 2;

binmode STDOUT, ':encoding(UTF-8)';

main();

# Entry point: prints every category and the products listed under it.
# Dies if the start page itself cannot be loaded; any other page that fails
# to load is reported on STDERR and skipped.
sub main () {
    my $ua  = Mojo::UserAgent->new;
    my $dom = fetch_dom($ua, $BASE_URL);
    die "Cannot load the start page $BASE_URL\n" unless defined $dom;

    for my $anchor ($dom->find('a.s123-fast-page-load')->each) {
        my $href = $anchor->attr('href');
        next unless defined $href;

        # Trim before testing for emptiness so whitespace-only anchors are
        # skipped as well.
        my $title = trim($anchor->all_text);
        next if $title eq '';

        my $url = absolute_url($href);
        say $title;
        say $url;

        my $category_dom = fetch_dom($ua, $url);
        next unless defined $category_dom;

        for my $container ($category_dom->find('a.article-container')->each) {
            print_product($ua, $container);
        }
    }
    return;
}

# Fetches the product page linked from $container (an 'a.article-container'
# element) and prints one "(title)(url)(ingredients)(price)..." line.
# Products without an href, or whose page cannot be fetched, are skipped.
sub print_product ($ua, $container) {
    my $href = $container->attr('href');
    return unless defined $href;
    my $url = absolute_url($href);

    # A card without an <h4> yields an empty title instead of a fatal error.
    my $heading = $container->at('h4');
    my $title   = defined $heading ? $heading->all_text : '';

    my $dom = fetch_dom($ua, $url);
    return unless defined $dom;

    # The first <strong> element of the product page is used as the
    # ingredient list.
    my $strong      = $dom->at('strong');
    my $ingredients = defined $strong ? $strong->all_text : '';

    my @prices = extract_prices($dom->all_text);

    say join '', map { "($_)" } $title, $url, $ingredients, @prices;
    return;
}

# Returns the Mojo::DOM of $url, or nothing when the request fails with a
# connection error or a 4xx/5xx response (a warning is emitted in that case).
sub fetch_dom ($ua, $url) {
    # ->result dies on connection errors; catch that so that a single broken
    # page does not abort the whole run.
    my $res = eval { $ua->get($url)->result };
    if (!defined $res) {
        my $error = $@ || 'unknown error';
        chomp $error;
        warn "Skipping $url: $error\n";
        return;
    }
    if ($res->is_error) {
        warn sprintf "Skipping %s: HTTP %s %s\n",
            $url, $res->code, $res->message // '';
        return;
    }
    return $res->dom;
}

# Returns $href unchanged when it already is an absolute http(s) URL,
# otherwise prefixes it with $BASE_URL.
sub absolute_url ($href) {
    return $href if $href =~ m{\A https?:// }xi;
    return $BASE_URL . $href;
}

# Returns a copy of $text without leading and trailing whitespace.
sub trim ($text) {
    return $text =~ s/\A\s+|\s+\z//gr;
}

# Extracts price strings from the visible text of a product page.
#
# First looks for up to $MAX_PRICES occurrences of "<non-space run>" followed
# by optional whitespace and "€"; each one is returned with inner whitespace
# removed and commas turned into dots (e.g. "12,50 €" -> "12.50€").
# If none is found, falls back to the first "<digits>,<two digits>" token and
# returns it with "€" appended. Returns an empty list when nothing matches.
sub extract_prices ($text) {
    my @prices;

    while ($text =~ /(\S+\s*€)/ug) {
        my $price = $1;
        $price =~ s/,/./g;
        $price =~ s/\s//g;
        push @prices, $price;
        last if @prices == $MAX_PRICES;
    }
    return @prices if @prices;

    # Some pages list the amount without a currency sign.
    my ($bare_amount) = $text =~ /(\d+,\d{2})(?:\s|$)/;
    if (defined $bare_amount) {
        $bare_amount =~ s/,/./g;
        push @prices, $bare_amount . '€';
    }
    return @prices;
}