Refactoring a little to make the code easier to change.

This commit is contained in:
sergiotarxz 2021-10-22 22:03:05 +02:00
parent dab00a460f
commit 6dd5f63428
1 changed files with 229 additions and 183 deletions

View File

@ -11,278 +11,324 @@
#include <manga.h> #include <manga.h>
/*
 * "extended" argument handed to xmlCopyNode() by loop_search_class().
 * The expansion is parenthesised so that it keeps the value 3 when the
 * macro is used inside a larger expression (e.g. `X & 1`).
 * NOTE(review): libxml2 documents the values 1 (recursive copy) and
 * 2 (properties and namespaces only); confirm the combined value is intended.
 */
#define XML_COPY_NODE_RECURSIVE (2 | 1)
/* Base URL of the site downloaded by retrieve_mangafox_title(). */
const char *mangafox_url =
    "https://mangafox.fun";
/* One piece of text produced by the splitter, together with its length. */
struct String {
    char *content; /* buffer obtained from alloc_string(); g_free()d by splitted_string_free() */
    size_t size;   /* number of characters recorded for content */
};
/* Result of split(): an array of substrings and its element count. */
struct SplittedString {
    struct String *substrings; /* array of pieces; NULL until the first piece is stored */
    size_t n_strings;          /* number of pieces stored in substrings */
};
struct Manga * struct Manga *
parse_main_mangafox_page ( parse_main_mangafox_page (
const xmlDocPtr html_document, const xmlDocPtr html_document,
const size_t *size); const size_t *size);
xmlXPathObjectPtr xmlXPathObjectPtr
get_nodes_xpath_expression ( get_nodes_xpath_expression (
const xmlDocPtr document, const xmlDocPtr document,
char *xpath); char *xpath);
char * char *
alloc_string(size_t len); alloc_string(size_t len);
void void
copy_substring(const char *origin, char *dest, copy_substring(const char *origin, char *dest,
size_t dest_len, size_t start, size_t len); size_t dest_len, size_t start, size_t len);
void void
print_classes (const char *class_attribute, print_classes (const char *class_attribute,
size_t class_attribute_size); size_t class_attribute_size);
int int
has_class (const char *class_attribute, has_class (const char *class_attribute,
char *class_to_check); const char *class_to_check);
void void
splitted_string_free (struct SplittedString *splitted_string); splitted_string_free (struct SplittedString *splitted_string);
struct SplittedString * struct SplittedString *
split(char *re_str, size_t re_str_size, const char *subject, split(char *re_str, size_t re_str_size, const char *subject,
size_t subject_size); size_t subject_size);
void void
iterate_string_to_split(struct SplittedString *splitted_string, pcre2_code *re, int *will_break, const char *subject, iterate_string_to_split(struct SplittedString *splitted_string, pcre2_code *re, int *will_break, const char *subject,
size_t subject_size, size_t *start_pos, size_t *offset); size_t subject_size, size_t *start_pos, size_t *offset);
char * char *
get_request (const char *url, gsize *size_response_text); get_request (const char *url, gsize *size_response_text);
xmlNodePtr *
loop_search_class(const xmlNodePtr node, xmlNodePtr *nodes,
const char * class, size_t *len);
void
print_debug_nodes (const xmlDocPtr html_document,
xmlNodePtr *nodes, size_t nodes_len);
xmlNodePtr *
find_all_manga_slide(const xmlDocPtr html_document,
size_t *len);
void void
retrieve_mangafox_title () { retrieve_mangafox_title () {
xmlDocPtr html_response; xmlDocPtr html_response;
gsize size_response_text; gsize size_response_text;
char *response_text = get_request (mangafox_url, char *response_text = get_request (mangafox_url,
&size_response_text); &size_response_text);
html_response = htmlReadMemory (response_text, html_response = htmlReadMemory (response_text,
size_response_text, size_response_text,
NULL, NULL,
NULL, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NODEFDTD HTML_PARSE_RECOVER | HTML_PARSE_NODEFDTD
| HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING
); );
size_t manga_size; size_t manga_size;
parse_main_mangafox_page (html_response, &manga_size); parse_main_mangafox_page (html_response, &manga_size);
xmlFreeDoc (html_response); xmlFreeDoc (html_response);
free (response_text); free (response_text);
} }
char * char *
get_request (const char *url, gsize *size_response_text) { get_request (const char *url, gsize *size_response_text) {
SoupSession *soup_session; SoupSession *soup_session;
SoupMessage *msg; SoupMessage *msg;
GValue response = G_VALUE_INIT; GValue response = G_VALUE_INIT;
guint status; guint status;
*size_response_text = 0; *size_response_text = 0;
g_value_init (&response, G_TYPE_BYTES);
soup_session = soup_session_new (); g_value_init (&response, G_TYPE_BYTES);
msg = soup_message_new ("GET", url);
status = soup_session_send_message (soup_session, msg);
g_object_get_property(
G_OBJECT (msg),
"response-body-data",
&response);
printf("%u\n", status); soup_session = soup_session_new ();
const char *html_response = g_bytes_get_data ((GBytes *) msg = soup_message_new ("GET", url);
g_value_peek_pointer (&response), status = soup_session_send_message (soup_session, msg);
size_response_text); g_object_get_property(
G_OBJECT (msg),
"response-body-data",
&response);
char *return_value = g_strndup (html_response, *size_response_text); printf("%u\n", status);
const char *html_response = g_bytes_get_data ((GBytes *)
g_value_peek_pointer (&response),
size_response_text);
g_value_unset (&response); char *return_value = g_strndup (html_response, *size_response_text);
g_object_unref (soup_session);
g_object_unref (msg);
return return_value; g_value_unset (&response);
g_object_unref (soup_session);
g_object_unref (msg);
return return_value;
} }
struct Manga * struct Manga *
parse_main_mangafox_page (const xmlDocPtr html_document, parse_main_mangafox_page (const xmlDocPtr html_document,
const size_t *size) { const size_t *size) {
xmlIndentTreeOutput = 1; xmlNodePtr *nodes;
xmlXPathObjectPtr xpath_result = get_nodes_xpath_expression (html_document, size_t nodes_len = 0;
"//div[@class]");
if (!xpath_result) {
fprintf(stderr, "Empty xpath result\n");
return NULL;
}
xmlNodeSetPtr node_set = xpath_result->nodesetval;
if (!node_set) {
fprintf(stderr, "No match\n");
return NULL;
}
for (int i = 0; i < node_set->nodeNr; i++) {
xmlNodePtr node = node_set->nodeTab[i];
for (xmlAttr *attrs = node->properties; attrs; attrs=attrs->next) {
if (!xmlStrcmp(attrs->name, (const xmlChar *)"class")
&& attrs->children && attrs->children->content) {
const char *content = (char *) attrs->children->content;
if (has_class (content, "manga-slide")) {
printf("%s\n", content);
}
}
}
}
xmlXPathFreeObject (xpath_result); nodes = find_all_manga_slide (html_document, &nodes_len);
print_debug_nodes (html_document, nodes, nodes_len);
}
/*
 * Serialises nodes_len nodes into one buffer and writes it to stdout.
 *
 * html_document: document passed to xmlNodeDump() for each node.
 * nodes:         array of nodes to print.
 * nodes_len:     number of entries in nodes.
 */
void
print_debug_nodes (const xmlDocPtr html_document,
        xmlNodePtr *nodes, size_t nodes_len) {
    xmlBufferPtr buffer = xmlBufferCreate ();

    if (!buffer) {
        fprintf (stderr, "Unable to allocate the xml buffer\n");
        return;
    }
    for (size_t i = 0; i < nodes_len; i++) {
        xmlNodeDump (buffer, html_document, nodes[i],
                0, 1);
    }
    xmlBufferDump (stdout, buffer);
    xmlBufferFree (buffer);
}
/*
 * Collects a copy of every div element of html_document whose class
 * attribute contains "manga-slide".
 *
 * html_document: parsed HTML page.
 * len:           receives the number of nodes returned.
 *
 * Returns an array of *len node copies, or NULL when nothing matched or
 * the XPath query failed.
 */
xmlNodePtr *
find_all_manga_slide(const xmlDocPtr html_document,
        size_t *len) {
    xmlNodePtr *nodes = NULL;
    xmlNodeSetPtr node_set;
    xmlXPathObjectPtr xpath_result;

    /* The array starts empty, so the count must start at zero as well. */
    *len = 0;
    xpath_result = get_nodes_xpath_expression (html_document,
            "//div[@class]");
    if (!xpath_result) {
        fprintf(stderr, "Empty xpath result\n");
        return NULL;
    }
    node_set = xpath_result->nodesetval;
    if (!node_set) {
        fprintf(stderr, "No match\n");
        goto cleanup_find_all_manga_slide;
    }
    for (int i = 0; i < node_set->nodeNr; i++) {
        nodes = loop_search_class (node_set->nodeTab[i], nodes,
                "manga-slide", len);
    }
cleanup_find_all_manga_slide:
    xmlXPathFreeObject (xpath_result);
    return nodes;
}
/*
 * Appends a copy of node to nodes when its class attribute lists class.
 *
 * node:  element whose attributes are inspected.
 * nodes: array built so far (may be NULL); grown with g_realloc().
 * class: class name to look for.
 * len:   number of entries in nodes; incremented for each node appended.
 *
 * Returns the (possibly moved) array.
 */
xmlNodePtr *
loop_search_class(const xmlNodePtr node, xmlNodePtr *nodes,
        const char * class, size_t *len) {
    for (xmlAttr *attr = node->properties; attr; attr = attr->next) {
        xmlNodePtr copy;

        if (xmlStrcmp (attr->name, (const xmlChar *) "class") != 0
                || !attr->children || !attr->children->content) {
            continue;
        }
        if (!has_class ((const char *) attr->children->content, class)) {
            continue;
        }
        /* Copy first so a failed copy is never stored or counted. */
        copy = xmlCopyNode (node, XML_COPY_NODE_RECURSIVE);
        if (!copy) {
            fprintf (stderr, "Unable to copy node\n");
            continue;
        }
        nodes = g_realloc (nodes, (sizeof *nodes) * (*len + 1));
        nodes[*len] = copy;
        (*len)++;
    }
    return nodes;
}
int int
has_class (const char *class_attribute, has_class (const char *class_attribute,
char *class_to_check) { const char *class_to_check) {
char *re = "\\s+"; char *re = "\\s+";
struct SplittedString *classes; struct SplittedString *classes;
int return_value = 0; int return_value = 0;
classes = split(re, strlen(re), class_attribute, classes = split(re, strlen(re), class_attribute,
strlen(class_attribute)); strlen(class_attribute));
for (int i = 0; i<classes->n_strings; i++) { for (int i = 0; i<classes->n_strings; i++) {
if (strcmp(classes->substrings[i].content, class_to_check) == 0) { if (strcmp(classes->substrings[i].content, class_to_check) == 0) {
return_value = 1; return_value = 1;
goto cleanup_has_class; goto cleanup_has_class;
} }
} }
cleanup_has_class: cleanup_has_class:
splitted_string_free (classes); splitted_string_free (classes);
return return_value; return return_value;
} }
void void
splitted_string_free (struct SplittedString *splitted_string) { splitted_string_free (struct SplittedString *splitted_string) {
for (int i = 0; i<splitted_string->n_strings; i++) { for (int i = 0; i<splitted_string->n_strings; i++) {
g_free (splitted_string->substrings[i].content); g_free (splitted_string->substrings[i].content);
} }
g_free (splitted_string->substrings); g_free (splitted_string->substrings);
g_free (splitted_string); g_free (splitted_string);
} }
struct SplittedString * struct SplittedString *
split(char *re_str, size_t re_str_size, const char *subject, size_t subject_size) { split(char *re_str, size_t re_str_size, const char *subject, size_t subject_size) {
pcre2_code_8 *re; pcre2_code_8 *re;
size_t start_pos = 0; size_t start_pos = 0;
size_t offset = 0; size_t offset = 0;
int regex_compile_error; int regex_compile_error;
PCRE2_SIZE error_offset; PCRE2_SIZE error_offset;
struct SplittedString *splitted_string; struct SplittedString *splitted_string;
splitted_string = g_malloc (sizeof *splitted_string); splitted_string = g_malloc (sizeof *splitted_string);
splitted_string->n_strings = 0; splitted_string->n_strings = 0;
splitted_string->substrings = NULL; splitted_string->substrings = NULL;
re = pcre2_compile ((PCRE2_SPTR8) re_str, re = pcre2_compile ((PCRE2_SPTR8) re_str,
re_str_size, 0, &regex_compile_error, &error_offset, NULL); re_str_size, 0, &regex_compile_error, &error_offset, NULL);
while (start_pos < subject_size) { while (start_pos < subject_size) {
int will_break = 0; int will_break = 0;
iterate_string_to_split(splitted_string, re, &will_break, iterate_string_to_split(splitted_string, re, &will_break,
subject, subject_size, &start_pos, &offset); subject, subject_size, &start_pos, &offset);
if (will_break) { if (will_break) {
break; break;
} }
} }
pcre2_code_free (re); pcre2_code_free (re);
re = NULL; re = NULL;
return splitted_string; return splitted_string;
} }
/*
 * One step of split(): stores the piece of subject that starts at
 * *start_pos and ends at the next match of re (or at the end of subject).
 *
 * splitted_string: container that receives the new piece.
 * re:              compiled separator pattern.
 * will_break:      set to 1 when the caller must stop iterating.
 * subject / subject_size: text being split and its length.
 * start_pos:       in/out, where the next piece begins.
 * offset:          in/out, index of the slot to fill in substrings.
 *
 * When a separator is found, *start_pos moves past it and *offset is
 * incremented; otherwise the remainder of subject becomes the last piece.
 */
void
iterate_string_to_split(struct SplittedString *splitted_string, pcre2_code *re, int *will_break, const char *subject,
        size_t subject_size, size_t *start_pos, size_t *offset) {
    pcre2_match_data_8 *match_data;
    struct String *current_substring;
    size_t piece_end = subject_size;
    size_t next_start = subject_size;
    size_t piece_len;
    int separator_found = 0;
    int rc;

    match_data = pcre2_match_data_create_from_pattern_8 (re, NULL);
    if (!match_data) {
        fprintf (stderr, "Unable to allocate pcre2 match data\n");
        *will_break = 1;
        return;
    }
    rc = pcre2_match_8 ( re, (PCRE2_SPTR8) subject, subject_size, *start_pos, 0, match_data,
            NULL);
    if (rc >= 0) {
        PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(match_data);
        /*
         * A match that does not move past *start_pos would be found again
         * on every following call; treat it like "no separator left".
         */
        if (ovector[0] >= *start_pos && ovector[1] > *start_pos) {
            piece_end = ovector[0];
            next_start = ovector[1];
            separator_found = 1;
        }
    }

    /* g_realloc() accepts NULL, so the first piece needs no special case. */
    splitted_string->substrings = g_realloc (splitted_string->substrings,
            (sizeof *splitted_string->substrings) * (*offset + 1));
    splitted_string->n_strings++;

    /* The piece spans [*start_pos, piece_end): its full length is copied. */
    piece_len = piece_end - *start_pos;
    current_substring = &splitted_string->substrings [*offset];
    current_substring->content = alloc_string (piece_len);
    copy_substring (subject, current_substring->content,
            subject_size,
            *start_pos,
            piece_len);
    current_substring->size = piece_len;

    if (separator_found) {
        *start_pos = next_start;
        *offset += 1;
    } else {
        *will_break = 1;
    }
    pcre2_match_data_free (match_data);
}
/*
 * Allocates room for len characters plus the terminating NUL.
 * The buffer comes from g_malloc() and is uninitialised.
 */
char *
alloc_string(size_t len) {
    /* Parenthesised so the element size scales the whole count. */
    char *return_value = g_malloc ((len + 1) * sizeof *return_value);
    return return_value;
}
/*
 * Copies up to len characters of origin, starting at index start, into
 * dest and NUL-terminates the result.
 *
 * origin:   source text.
 * dest:     destination buffer with room for at least len + 1 characters.
 * dest_len: number of readable characters in origin (the callers in this
 *           file pass the subject length here).
 * start:    index of the first character to copy.
 * len:      number of characters requested.
 *
 * When the request runs past dest_len the copy is truncated, a message is
 * written to stderr and dest is terminated right after the copied data.
 */
void
copy_substring(const char *origin, char *dest, size_t dest_len, size_t start,
        size_t len) {
    size_t copied = 0;

    while (copied < len) {
        /* Written without start + copied so the test cannot wrap around. */
        if (start > dest_len || copied >= dest_len - start) {
            fprintf(stderr, "Read attempt out of bounds.%zu %zu %zu\n", dest_len, start, len);
            break;
        }
        dest[copied] = origin[start + copied];
        copied++;
    }
    /* Terminate after what was really copied, not at the requested length. */
    dest[copied] = '\0';
}
xmlXPathObjectPtr xmlXPathObjectPtr
get_nodes_xpath_expression (const xmlDocPtr document, char *xpath) { get_nodes_xpath_expression (const xmlDocPtr document, char *xpath) {
xmlXPathContextPtr context; xmlXPathContextPtr context;
xmlXPathObjectPtr result; xmlXPathObjectPtr result;
context = xmlXPathNewContext (document); context = xmlXPathNewContext (document);
if (!context) { result = xmlXPathEvalExpression ((const xmlChar *)xpath, context);
fprintf(stderr, "Error in xmlXpathNewContext\n");
return NULL;
}
result = xmlXPathEvalExpression ((const xmlChar *)xpath, context);
xmlXPathFreeContext (context); xmlXPathFreeContext (context);
return result; return result;
} }