From 20fc7ff6a4aeb3dbd58adfbc4e4000cd87aa67cc Mon Sep 17 00:00:00 2001 From: sergiotarxz Date: Mon, 25 Oct 2021 00:39:16 +0200 Subject: [PATCH] Migrating to readmng, mangafox is fallen almost always and does to much to stop scrappers. --- Makefile | 2 +- include/manga.h | 58 ++++++ include/mangafox.h | 2 - include/readmng.h | 2 + main.c | 141 ++++++++++---- mangafox.c | 460 --------------------------------------------- readmng.c | 153 +++++++++++++++ 7 files changed, 319 insertions(+), 499 deletions(-) delete mode 100644 include/mangafox.h create mode 100644 include/readmng.h delete mode 100644 mangafox.c create mode 100644 readmng.c diff --git a/Makefile b/Makefile index f1e62a5..673fbf8 100644 --- a/Makefile +++ b/Makefile @@ -6,4 +6,4 @@ LDFLAGS := $(shell pkg-config --libs ${LIBS}) CC_COMMAND := ${CC} ${INCDIR} ${CFLAGS} all: build build: - ${CC_COMMAND} mangafox.c main.c -o main ${LDFLAGS} -ggdb + ${CC_COMMAND} readmng.c manga.c main.c -o main ${LDFLAGS} -ggdb diff --git a/include/manga.h b/include/manga.h index c08dcaf..b0e6c6d 100644 --- a/include/manga.h +++ b/include/manga.h @@ -1,4 +1,62 @@ +#ifndef MANGA +#define MANGA + +#include +#include +#include +#ifndef PCRE2_CODE_UNIT_WIDTH +#define PCRE2_CODE_UNIT_WIDTH 8 +#include +#endif + +#define XML_COPY_NODE_RECURSIVE 2 | 1 + struct Manga { char *title; char *image_url; }; + +struct SplittedString { + struct String *substrings; + size_t n_strings; +}; + +struct String { + char *content; + size_t size; +}; + +char * +get_request (const char *url, gsize *size_response_text); +xmlNodePtr * +find_class (xmlNodePtr node, char *class, size_t *len, xmlNodePtr *nodes, + int return_on_first); +void +print_debug_nodes (const xmlDocPtr html_document, + xmlNodePtr *nodes, size_t nodes_len); +char * +get_attr (xmlNodePtr const node, const char *attr_name); +void +copy_substring(const char *origin, char *dest, size_t dest_len, size_t start, + size_t len); +int +has_class (const char *class_attribute, + const char *class_to_check); +struct SplittedString * +split(char *re_str, size_t re_str_size, const char *subject, size_t subject_size); +char * +alloc_string(size_t len); +void +splitted_string_free (struct SplittedString *splitted_string); +void +iterate_string_to_split(struct SplittedString *splitted_string, + pcre2_code *re, int *will_break, const char *subject, + size_t subject_size, size_t *start_pos, size_t *offset); +xmlXPathObjectPtr +get_nodes_xpath_expression (const xmlDocPtr document, char *xpath); +xmlNodePtr * +loop_search_class (const xmlNodePtr node, xmlNodePtr *nodes, + const char * class, size_t *len); +char * +copy_binary_data (const char *input, size_t size); +#endif diff --git a/include/mangafox.h b/include/mangafox.h deleted file mode 100644 index 1b4b439..0000000 --- a/include/mangafox.h +++ /dev/null @@ -1,2 +0,0 @@ -void -retrieve_mangafox_title(); diff --git a/include/readmng.h b/include/readmng.h new file mode 100644 index 0000000..4206249 --- /dev/null +++ b/include/readmng.h @@ -0,0 +1,2 @@ +struct Manga * +retrieve_readmng_title_mangas (size_t *const len); diff --git a/main.c b/main.c index ce235bd..b9ba9e8 100644 --- a/main.c +++ b/main.c @@ -1,63 +1,132 @@ #include #include -#include +#include +#include AdwHeaderBar * create_headerbar (GtkBox *box); GtkBox * create_main_box (AdwApplicationWindow *window); +GtkBox * +create_manga_container (); +AdwCarousel * +create_adw_caroulsel (GtkBox *box); -static void +void +fill_carousel_of_mangas (AdwCarousel *carousel); + +void activate (AdwApplication *app, - gpointer user_data) + gpointer user_data) { - GtkWidget *window = - adw_application_window_new (GTK_APPLICATION (app)); - GtkBox *box = create_main_box( - ADW_APPLICATION_WINDOW - (window)); - create_headerbar (box); + GtkWidget *window = + adw_application_window_new (GTK_APPLICATION (app)); + GtkBox *box = create_main_box( + ADW_APPLICATION_WINDOW + (window)); + AdwCarousel *carousel; + create_headerbar (box); - gtk_widget_show (window); + carousel = create_adw_caroulsel (box); + fill_carousel_of_mangas (carousel); + + gtk_widget_show (window); +} + +void +fill_carousel_of_mangas (AdwCarousel *carousel) { + struct Manga *mangas; + struct Manga *manga; + GtkBox *manga_container; + size_t len_mangas = 0; + + mangas = retrieve_readmng_title_mangas (&len_mangas); + for (int i = 0; iimage_url, &size_downloaded_image); + tmp_image = g_file_new_tmp ("mangareadertmpfileXXXXXX", + &iostream, + &error + ); + if (error) { + fprintf (stderr, "Unable to read file: %s\n", error->message); + return; + } + error = NULL; + g_output_stream_write (g_io_stream_get_output_stream (G_IO_STREAM (iostream)), + downloaded_image, size_downloaded_image, NULL, &error); + if (error) { + fprintf (stderr, "Unable to write file: %s\n", error->message); + return; + } + picture = gtk_picture_new_for_file (tmp_image); + gtk_box_append (manga_container, picture); + } +} + +GtkBox * +create_manga_container () { + GtkBox *manga_container; + manga_container = GTK_BOX (gtk_box_new( + GTK_ORIENTATION_HORIZONTAL, + 0)); + return manga_container; } GtkBox * create_main_box (AdwApplicationWindow *window) { - GtkWidget *box = gtk_box_new( - GTK_ORIENTATION_VERTICAL, - 10); - adw_application_window_set_content( - window, - box); - return GTK_BOX (box); + GtkWidget *box = gtk_box_new( + GTK_ORIENTATION_VERTICAL, + 10); + adw_application_window_set_content( + window, + box); + return GTK_BOX (box); +} + +AdwCarousel * +create_adw_caroulsel (GtkBox *box) { + GtkWidget *carousel = adw_carousel_new (); + gtk_box_append (box, carousel); + return ADW_CAROUSEL (carousel); } AdwHeaderBar * create_headerbar (GtkBox *box) { - GtkWidget *title = - adw_window_title_new ("Window", NULL); - GtkWidget *header = - adw_header_bar_new(); - adw_header_bar_set_title_widget( - ADW_HEADER_BAR (header), - GTK_WIDGET (title)); - gtk_box_append (GTK_BOX (box), header); + GtkWidget *title = + adw_window_title_new ("Window", NULL); + GtkWidget *header = + adw_header_bar_new(); + adw_header_bar_set_title_widget( + ADW_HEADER_BAR (header), + GTK_WIDGET (title)); + gtk_box_append (box, header); - return ADW_HEADER_BAR (header); + return ADW_HEADER_BAR (header); } -int + int main (int argc, - char **argv) + char **argv) { - AdwApplication *app; - retrieve_mangafox_title(); - int status; - app = adw_application_new ("org.mangareader", G_APPLICATION_FLAGS_NONE); - g_signal_connect (app, "activate", G_CALLBACK (activate), NULL); - status = g_application_run (G_APPLICATION (app), argc, argv); - g_object_unref (app); - return status; + AdwApplication *app; + int status; + + app = adw_application_new ("org.mangareader", G_APPLICATION_FLAGS_NONE); + g_signal_connect (app, "activate", G_CALLBACK (activate), NULL); + status = g_application_run (G_APPLICATION (app), argc, argv); + g_object_unref (app); + return status; } diff --git a/mangafox.c b/mangafox.c deleted file mode 100644 index d895c3e..0000000 --- a/mangafox.c +++ /dev/null @@ -1,460 +0,0 @@ -#include - -#include -#include -#include - -#ifndef PCRE2_CODE_UNIT_WIDTH -#define PCRE2_CODE_UNIT_WIDTH 8 -#include -#endif - -#include - -#define XML_COPY_NODE_RECURSIVE 2 | 1 - -const char *mangafox_url = -"https://mangafox.fun"; -struct String { - char *content; - size_t size; -}; - -struct SplittedString { - struct String *substrings; - size_t n_strings; -}; - -struct Manga * -parse_main_mangafox_page ( - const xmlDocPtr html_document, - const size_t *size); -xmlXPathObjectPtr -get_nodes_xpath_expression ( - const xmlDocPtr document, - char *xpath); -char * -alloc_string(size_t len); -void -copy_substring(const char *origin, char *dest, - size_t dest_len, size_t start, size_t len); -void -print_classes (const char *class_attribute, - size_t class_attribute_size); -int -has_class (const char *class_attribute, - const char *class_to_check); -void -splitted_string_free (struct SplittedString *splitted_string); - -struct SplittedString * -split(char *re_str, size_t re_str_size, const char *subject, - size_t subject_size); -void -iterate_string_to_split(struct SplittedString *splitted_string, pcre2_code *re, int *will_break, const char *subject, - size_t subject_size, size_t *start_pos, size_t *offset); -char * -get_request (const char *url, gsize *size_response_text); -xmlNodePtr * -loop_search_class(const xmlNodePtr node, xmlNodePtr *nodes, - const char * class, size_t *len); -void -print_debug_nodes (const xmlDocPtr html_document, - xmlNodePtr *nodes, size_t nodes_len); -xmlNodePtr * -find_all_manga_slide(const xmlDocPtr html_document, - size_t *len); -char * -get_attr (xmlNodePtr const node, const char *attr_name); -char * -get_manga_slide_cover(xmlNodePtr node); -char * -match_1 (char *re_str, char *subject); -xmlNodePtr -find_class(xmlNodePtr node, char *class); -char * -get_manga_slide_title(xmlNodePtr node); - -void -retrieve_mangafox_title () { - xmlDocPtr html_response; - gsize size_response_text; - char *response_text = get_request (mangafox_url, - &size_response_text); - html_response = htmlReadMemory (response_text, - size_response_text, - NULL, - NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NODEFDTD - | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING - ); - size_t manga_size; - parse_main_mangafox_page (html_response, &manga_size); - xmlFreeDoc (html_response); - free (response_text); -} - -char * -get_request (const char *url, gsize *size_response_text) { - SoupSession *soup_session; - SoupMessage *msg; - GValue response = G_VALUE_INIT; - guint status; - - *size_response_text = 0; - - g_value_init (&response, G_TYPE_BYTES); - - soup_session = soup_session_new (); - msg = soup_message_new ("GET", url); - status = soup_session_send_message (soup_session, msg); - g_object_get_property( - G_OBJECT (msg), - "response-body-data", - &response); - - printf("%u\n", status); - const char *html_response = g_bytes_get_data ((GBytes *) - g_value_peek_pointer (&response), - size_response_text); - - char *return_value = g_strndup (html_response, *size_response_text); - - g_value_unset (&response); - g_object_unref (soup_session); - g_object_unref (msg); - - return return_value; -} - - -struct Manga * -parse_main_mangafox_page (const xmlDocPtr html_document, - const size_t *size) { - xmlNodePtr *nodes; - xmlNodePtr node; - size_t nodes_len = 0; - - nodes = find_all_manga_slide (html_document, &nodes_len); - print_debug_nodes (html_document, nodes, nodes_len); - for (int i = 0; i < nodes_len; i++) { - node = nodes[i]; - char *cover = get_manga_slide_cover(node); - if (cover) { - printf ("%s\n", cover); - } - char *title = get_manga_slide_title (node); - if (title) { - printf ("%s\n", title); - } - } - for (int i = 0; ichildren; child; child=child->next) { - char *attr = get_attr (child, "class"); - if (attr && has_class (attr, class)) { - return child; - } - if (node->children) { - xmlNodePtr child = node->children; - for (;child;child=child->next) { - xmlNodePtr result = find_class (child, class); - if (result) { - return result; - } - } - } - } - return NULL; -} - -char * -get_manga_slide_cover(xmlNodePtr node) { - for (xmlNodePtr child = node->children; child; child=child->next) { - char *attr = get_attr (child, "class"); - if (attr && has_class (attr, "m-slide-background")) { - char *style = get_attr (child, "style"); - char *match = match_1 ("background-image:url\\((.*?)\\)", style); - if (match) { - printf("%s\n", match); - return match; - } - } - } - return NULL; -} - -void -print_debug_nodes (const xmlDocPtr html_document, - xmlNodePtr *nodes, size_t nodes_len) { - xmlBufferPtr buffer = xmlBufferCreate (); - for (int i = 0; i < nodes_len; i++) { - xmlNodeDump (buffer, html_document, nodes[i], - 0, 1); - } - xmlBufferDump (stdout, buffer); - xmlBufferFree (buffer); -} - -xmlNodePtr * -find_all_manga_slide(const xmlDocPtr html_document, - size_t *len) { - xmlNodeSetPtr node_set; - xmlNodePtr *nodes; - xmlXPathObjectPtr xpath_result; - - node_set = NULL; - nodes = NULL; - xpath_result = get_nodes_xpath_expression (html_document, - "//div[@class]"); - - if (!xpath_result) { - fprintf(stderr, "Empty xpath result\n"); - goto cleanup_find_all_manga_slide; - } - node_set = xpath_result->nodesetval; - if (!node_set) { - fprintf(stderr, "No match\n"); - goto cleanup_find_all_manga_slide; - } - for (int i = 0; i < node_set->nodeNr; i++) { - xmlNodePtr node = node_set->nodeTab[i]; - nodes = loop_search_class (node, nodes, "manga-slide", len); - } -cleanup_find_all_manga_slide: - xmlXPathFreeObject (xpath_result); - - return nodes; - -} - -char * -get_attr (xmlNodePtr const node, const char *attr_name) { - char *return_value = NULL; - for (xmlAttr *attr = node->properties; attr; attr=attr->next) { - if (!xmlStrcmp(attr->name, (const xmlChar *) attr_name) - && attr->children && attr->children->content) { - if (!attr->children->content) continue; - size_t content_len = strlen((char *) - attr->children->content); - return_value = alloc_string(content_len); - copy_substring ((char *) attr->children->content, return_value, - content_len, - 0, - content_len); - break; - } - } - return return_value; -} - -xmlNodePtr * -loop_search_class (const xmlNodePtr node, xmlNodePtr *nodes, - const char * class, size_t *len) { - char *content = get_attr (node, "class"); - if (!content) { - return nodes; - } - if (has_class (content, class)) { - (*len)++; - nodes = g_realloc (nodes, (sizeof *nodes) * *len); - nodes[(*len)-1] = xmlCopyNode(node, XML_COPY_NODE_RECURSIVE); - } - g_free (content); - return nodes; -} - -int -has_class (const char *class_attribute, - const char *class_to_check) { - char *re = "\\s+"; - struct SplittedString *classes; - int return_value = 0; - classes = split(re, strlen(re), class_attribute, - strlen(class_attribute)); - for (int i = 0; in_strings; i++) { - if (strcmp(classes->substrings[i].content, class_to_check) == 0) { - return_value = 1; - goto cleanup_has_class; - } - } - -cleanup_has_class: - splitted_string_free (classes); - return return_value; -} - -void -splitted_string_free (struct SplittedString *splitted_string) { - for (int i = 0; in_strings; i++) { - g_free (splitted_string->substrings[i].content); - } - - g_free (splitted_string->substrings); - g_free (splitted_string); -} - -struct SplittedString * -split(char *re_str, size_t re_str_size, const char *subject, size_t subject_size) { - pcre2_code_8 *re; - size_t start_pos = 0; - size_t offset = 0; - int regex_compile_error; - PCRE2_SIZE error_offset; - struct SplittedString *splitted_string; - - splitted_string = g_malloc (sizeof *splitted_string); - - splitted_string->n_strings = 0; - splitted_string->substrings = NULL; - re = pcre2_compile ((PCRE2_SPTR8) re_str, - re_str_size, 0, ®ex_compile_error, &error_offset, NULL); - while (start_pos < subject_size) { - int will_break = 0; - iterate_string_to_split(splitted_string, re, &will_break, - subject, subject_size, &start_pos, &offset); - if (will_break) { - break; - } - } - - pcre2_code_free (re); - re = NULL; - - return splitted_string; -} - -char * -match_1 (char *re_str, char *subject) { - pcre2_code *re; - pcre2_match_data *match_data; - - char *return_value; - int regex_compile_error; - int rc; - size_t len_match = 0; - - return_value = NULL; - PCRE2_SIZE error_offset; - - re = pcre2_compile ((PCRE2_SPTR8) re_str, strlen (re_str), 0, - ®ex_compile_error, &error_offset, NULL); - match_data = pcre2_match_data_create_from_pattern (re, NULL); - rc = pcre2_match (re, (PCRE2_SPTR8) subject, strlen (subject), - 0, 0, match_data, NULL); - if (rc < 0 ) { - goto cleanup_match; - } - - pcre2_substring_get_bynumber (match_data, 1, (PCRE2_UCHAR8**) - &return_value, &len_match); -cleanup_match: - pcre2_match_data_free (match_data); - pcre2_code_free (re); - return return_value; -} - -void -iterate_string_to_split(struct SplittedString *splitted_string, pcre2_code *re, int *will_break, const char *subject, - size_t subject_size, size_t *start_pos, size_t *offset) { - pcre2_match_data_8 *match_data; - PCRE2_SIZE *ovector; - int rc; - - splitted_string->n_strings++; - match_data = pcre2_match_data_create_from_pattern_8 (re, NULL); - rc = pcre2_match ( re, (PCRE2_SPTR8) subject, subject_size, *start_pos, 0, match_data, - NULL); - if (splitted_string->substrings) { - splitted_string->substrings = g_realloc (splitted_string->substrings, - (sizeof *splitted_string->substrings) * (*offset + 1)); - } else { - splitted_string->substrings = g_malloc (sizeof *splitted_string->substrings); - } - if (rc < 0) { - struct String *current_substring = - &splitted_string->substrings [*offset]; - current_substring->content = alloc_string (subject_size - - *start_pos); - copy_substring (subject, current_substring->content, - subject_size, - *start_pos, - subject_size - *start_pos); - current_substring->size = subject_size - *start_pos; - - *will_break = 1; - goto cleanup_iterate_string_to_split; - } - ovector = pcre2_get_ovector_pointer_8(match_data); - splitted_string->substrings[*offset].content = alloc_string ( - ovector[0] - *start_pos); - copy_substring (subject, splitted_string->substrings[*offset] - .content, - subject_size, - *start_pos, - ovector[0] - *start_pos - 1); - splitted_string->substrings[*offset].size = - ovector[0] - *start_pos - 1; - - *start_pos = ovector[1]; - - *offset += 1; - -cleanup_iterate_string_to_split: - pcre2_match_data_free (match_data); -} - -char * -alloc_string(size_t len) { - char * return_value = NULL; - return g_malloc (len + 1 * sizeof *return_value); -} - -void -copy_substring(const char *origin, char *dest, size_t dest_len, size_t start, - size_t len) { - size_t copying_offset = 0; - while (copying_offset < len) { - if (!(start+copying_offset <=dest_len)) { - fprintf(stderr, "Read attempt out of bounds.%ld %ld %ld\n", dest_len, start, len); - break; - } - dest[copying_offset] = origin[start+copying_offset]; - copying_offset++; - } - dest[len] = '\0'; -} - -xmlXPathObjectPtr -get_nodes_xpath_expression (const xmlDocPtr document, char *xpath) { - xmlXPathContextPtr context; - xmlXPathObjectPtr result; - - context = xmlXPathNewContext (document); - result = xmlXPathEvalExpression ((const xmlChar *)xpath, context); - - xmlXPathFreeContext (context); - - return result; -} - diff --git a/readmng.c b/readmng.c new file mode 100644 index 0000000..17a6222 --- /dev/null +++ b/readmng.c @@ -0,0 +1,153 @@ +#include +#include + +const char *readmng_url = "https://www.readmng.com/"; + +struct Manga * +parse_readmng_title_page (const xmlDocPtr html_document, + size_t *const len); +xmlNodePtr +retrieve_slides (const xmlDocPtr html_document); +xmlNodePtr +retrieve_ul_slides (xmlNodePtr const slides); +xmlNodePtr * +retrieve_li_slides (xmlNodePtr const slides, size_t *li_len); +xmlNodePtr +retrieve_img_from_thumnail (xmlNodePtr thumbnail); +xmlNodePtr +retrieve_thumbnail_from_li (xmlNodePtr current_li); +xmlNodePtr +retrieve_title_from_li (xmlNodePtr li); +struct Manga * +extract_manga_info_from_current_li (struct Manga *mangas, + xmlNodePtr current_li, size_t *len); + +struct Manga * +retrieve_readmng_title_mangas (size_t *const len) { + xmlDocPtr html_response; + gsize size_response_text; + struct Manga *mangas; + char *response_text = get_request (readmng_url, + &size_response_text); + html_response = htmlReadMemory (response_text, + size_response_text, + NULL, + NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NODEFDTD + | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING + ); + mangas = parse_readmng_title_page (html_response, len); + xmlFreeDoc (html_response); + g_free (response_text); + return mangas; +} + +struct Manga * +parse_readmng_title_page (const xmlDocPtr html_document, + size_t *const len) { + struct Manga *mangas = NULL; + xmlNodePtr slides = retrieve_slides (html_document); + *len = 0; + size_t li_len = 0; + xmlNodePtr *li = retrieve_li_slides (slides, &li_len); + for (int i = 0; iimage_url = get_attr (img, "src"); + manga->title = (char *) xmlNodeGetContent (title); + } + return mangas; +} + +xmlNodePtr +retrieve_title_from_li (xmlNodePtr li) { + size_t title_len = 0; + xmlNodePtr *title = find_class (li, "title", &title_len, NULL, 1); + if (title_len) return title[0]; + return NULL; +} + +xmlNodePtr +retrieve_img_from_thumnail (xmlNodePtr thumbnail) { + for (xmlNodePtr child = thumbnail->children; child; child=child->next) { + if (!strcmp((char *)child->name, "img")) { + return child; + } + } + return NULL; +} + +xmlNodePtr +retrieve_thumbnail_from_li (xmlNodePtr current_li) { + size_t thumbnail_len = 0; + xmlNodePtr *thumbnail = find_class (current_li, "thumbnail", + &thumbnail_len, NULL, 1); + if (thumbnail_len) return thumbnail[0]; + return NULL; +} + +xmlNodePtr * +retrieve_li_slides (xmlNodePtr const slides, size_t *li_len) { + xmlNodePtr ul_slides = retrieve_ul_slides (slides); + xmlNodePtr *li = NULL; + for (xmlNodePtr child = ul_slides->children; child; child=child->next) { + (*li_len)++; + li = g_realloc(li, sizeof *li * *li_len); + li[*li_len-1] = xmlCopyNode(child, XML_COPY_NODE_RECURSIVE); + } + return li; +} + +xmlNodePtr +retrieve_ul_slides (xmlNodePtr const slides) { + for (xmlNodePtr child = slides->children; child; child = child->next) { + if (!strcmp((char *) child->name, "ul")) { + return child; + } + } + return NULL; +} + +xmlNodePtr +retrieve_slides (const xmlDocPtr html_document) { + xmlNodePtr *nodes = NULL; + xmlXPathObjectPtr xpath_result = NULL; + xpath_result = get_nodes_xpath_expression (html_document, + "//div[@class]"); + xmlNodePtr slides = NULL; + xmlNodeSetPtr node_set = NULL; + size_t matching_classes_len = 0; + + node_set = xpath_result->nodesetval; + if (!node_set) { + fprintf(stderr, "No match\n"); + return NULL; + } + for (int i = 0; i < node_set->nodeNr; i++) { + xmlNodePtr node = node_set->nodeTab[i]; + nodes = loop_search_class (node, nodes, "slides", &matching_classes_len); + } + if (nodes) { + slides = nodes[0]; + } + if (xpath_result) { + xmlXPathFreeObject(xpath_result); + } + return slides; +}