Making some fixes and searching for `div` elements.

The next step is PCRE2 integration.
This commit is contained in:
sergiotarxz 2021-10-17 19:41:52 +02:00
parent 9591489cd9
commit 595eda2215

View File

@ -15,40 +15,16 @@ xmlXPathObjectPtr
get_nodes_xpath_expression ( get_nodes_xpath_expression (
const xmlDocPtr document, const xmlDocPtr document,
char *xpath); char *xpath);
char *
get_request (const char *url, gsize *size_response_text);
void void
retrieve_mangafox_title () { retrieve_mangafox_title () {
SoupSession
*soup_session;
SoupMessage *msg;
GValue response = G_VALUE_INIT;
guint status;
gsize size_response_text;
xmlDocPtr html_response; xmlDocPtr html_response;
gsize *size_response_text = malloc (sizeof (gsize));
g_value_init (&response, G_TYPE_BYTES); char *response_text = get_request (mangafox_url,
size_response_text);
soup_session =
soup_session_new();
msg =
soup_message_new(
"GET",
mangafox_url
);
status =
soup_session_send_message (soup_session, msg);
g_object_get_property(
G_OBJECT (msg),
"response-body-data",
&response);
const char *response_text =
g_bytes_get_data (
(GBytes *)
g_value_peek_pointer
(&response),
&size_response_text
);
html_response = htmlReadMemory (response_text, html_response = htmlReadMemory (response_text,
size_response_text, *size_response_text,
NULL, NULL,
NULL, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NODEFDTD HTML_PARSE_RECOVER | HTML_PARSE_NODEFDTD
@ -56,29 +32,67 @@ retrieve_mangafox_title() {
); );
size_t manga_size; size_t manga_size;
parse_main_mangafox_page (html_response, &manga_size); parse_main_mangafox_page (html_response, &manga_size);
free (response_text);
}
char *
get_request (const char *url, gsize *size_response_text) {
SoupSession *soup_session;
SoupMessage *msg;
GValue response = G_VALUE_INIT;
guint status;
*size_response_text = 0;
g_value_init (&response, G_TYPE_BYTES);
soup_session = soup_session_new ();
msg = soup_message_new ("GET", url);
status = soup_session_send_message (soup_session, msg);
g_object_get_property(
G_OBJECT (msg),
"response-body-data",
&response);
printf("%u\n", status); printf("%u\n", status);
const char *html_response = g_bytes_get_data ((GBytes *)
g_value_peek_pointer (&response),
size_response_text);
char *return_value = g_strndup (html_response, *size_response_text);
g_value_unset (&response);
g_object_unref (soup_session);
g_object_unref (msg);
return return_value;
} }
struct Manga * struct Manga *
parse_main_mangafox_page ( parse_main_mangafox_page (const xmlDocPtr html_document,
const xmlDocPtr html_document,
const size_t *size) { const size_t *size) {
xmlIndentTreeOutput = 1; xmlIndentTreeOutput = 1;
// xmlDocDump (stderr, html_document);
xmlXPathObjectPtr xpath_result = get_nodes_xpath_expression (html_document, xmlXPathObjectPtr xpath_result = get_nodes_xpath_expression (html_document,
"//a"); "//div[@class]");
if (!xpath_result) { if (!xpath_result) {
fprintf(stderr, "Empty xpath result\n"); fprintf(stderr, "Empty xpath result\n");
return NULL; return NULL;
} }
xmlNodeSetPtr node_set = xpath_result->nodesetval; xmlNodeSetPtr node_set = xpath_result->nodesetval;
printf("%d\n", node_set->nodeNr); if (!node_set) {
fprintf(stderr, "No match\n");
return NULL;
}
for (int i = 0; i < node_set->nodeNr; i++) { for (int i = 0; i < node_set->nodeNr; i++) {
xmlNodePtr node = node_set->nodeTab[i]; xmlNodePtr node = node_set->nodeTab[i];
for (xmlAttr *attrs = node->properties; attrs->next; attrs=attrs->next) { for (xmlAttr *attrs = node->properties; attrs; attrs=attrs->next) {
if (!xmlStrcmp(attrs->name, (const xmlChar *)"href")) { if (!xmlStrcmp(attrs->name, (const xmlChar *)"class")) {
if (attrs->children
&& attrs->children->content) {
printf("%s\n", (const char *)attrs->children->content); printf("%s\n", (const char *)attrs->children->content);
break;
}
} }
} }
@ -86,9 +100,7 @@ parse_main_mangafox_page (
} }
xmlXPathObjectPtr xmlXPathObjectPtr
get_nodes_xpath_expression ( get_nodes_xpath_expression (const xmlDocPtr document, char *xpath) {
const xmlDocPtr document,
char *xpath) {
xmlXPathContextPtr context; xmlXPathContextPtr context;
xmlXPathObjectPtr result; xmlXPathObjectPtr result;