113498266Sopenharmony_ci/*************************************************************************** 213498266Sopenharmony_ci * _ _ ____ _ 313498266Sopenharmony_ci * Project ___| | | | _ \| | 413498266Sopenharmony_ci * / __| | | | |_) | | 513498266Sopenharmony_ci * | (__| |_| | _ <| |___ 613498266Sopenharmony_ci * \___|\___/|_| \_\_____| 713498266Sopenharmony_ci * 813498266Sopenharmony_ci * Copyright (C) Jeroen Ooms <jeroenooms@gmail.com> 913498266Sopenharmony_ci * 1013498266Sopenharmony_ci * This software is licensed as described in the file COPYING, which 1113498266Sopenharmony_ci * you should have received as part of this distribution. The terms 1213498266Sopenharmony_ci * are also available at https://curl.se/docs/copyright.html. 1313498266Sopenharmony_ci * 1413498266Sopenharmony_ci * You may opt to use, copy, modify, merge, publish, distribute and/or sell 1513498266Sopenharmony_ci * copies of the Software, and permit persons to whom the Software is 1613498266Sopenharmony_ci * furnished to do so, under the terms of the COPYING file. 1713498266Sopenharmony_ci * 1813498266Sopenharmony_ci * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 1913498266Sopenharmony_ci * KIND, either express or implied. 2013498266Sopenharmony_ci * 2113498266Sopenharmony_ci * SPDX-License-Identifier: curl 2213498266Sopenharmony_ci * 2313498266Sopenharmony_ci * To compile: 2413498266Sopenharmony_ci * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl) 2513498266Sopenharmony_ci * 2613498266Sopenharmony_ci */ 2713498266Sopenharmony_ci/* <DESC> 2813498266Sopenharmony_ci * Web crawler based on curl and libxml2 to stress-test curl with 2913498266Sopenharmony_ci * hundreds of concurrent connections to various servers. 3013498266Sopenharmony_ci * </DESC> 3113498266Sopenharmony_ci */ 3213498266Sopenharmony_ci 3313498266Sopenharmony_ci/* Parameters */ 3413498266Sopenharmony_ciint max_con = 200; 3513498266Sopenharmony_ciint max_total = 20000; 3613498266Sopenharmony_ciint max_requests = 500; 3713498266Sopenharmony_ciint max_link_per_page = 5; 3813498266Sopenharmony_ciint follow_relative_links = 0; 3913498266Sopenharmony_cichar *start_page = "https://www.reuters.com"; 4013498266Sopenharmony_ci 4113498266Sopenharmony_ci#include <libxml/HTMLparser.h> 4213498266Sopenharmony_ci#include <libxml/xpath.h> 4313498266Sopenharmony_ci#include <libxml/uri.h> 4413498266Sopenharmony_ci#include <curl/curl.h> 4513498266Sopenharmony_ci#include <stdlib.h> 4613498266Sopenharmony_ci#include <string.h> 4713498266Sopenharmony_ci#include <math.h> 4813498266Sopenharmony_ci#include <signal.h> 4913498266Sopenharmony_ci 5013498266Sopenharmony_ciint pending_interrupt = 0; 5113498266Sopenharmony_civoid sighandler(int dummy) 5213498266Sopenharmony_ci{ 5313498266Sopenharmony_ci pending_interrupt = 1; 5413498266Sopenharmony_ci} 5513498266Sopenharmony_ci 5613498266Sopenharmony_ci/* resizable buffer */ 5713498266Sopenharmony_citypedef struct { 5813498266Sopenharmony_ci char *buf; 5913498266Sopenharmony_ci size_t size; 6013498266Sopenharmony_ci} memory; 6113498266Sopenharmony_ci 6213498266Sopenharmony_cisize_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx) 6313498266Sopenharmony_ci{ 6413498266Sopenharmony_ci size_t realsize = sz * nmemb; 6513498266Sopenharmony_ci memory *mem = (memory*) ctx; 6613498266Sopenharmony_ci char *ptr = realloc(mem->buf, mem->size + realsize); 6713498266Sopenharmony_ci if(!ptr) { 6813498266Sopenharmony_ci /* out of memory */ 6913498266Sopenharmony_ci printf("not enough memory (realloc returned NULL)\n"); 7013498266Sopenharmony_ci return 0; 7113498266Sopenharmony_ci } 7213498266Sopenharmony_ci mem->buf = ptr; 7313498266Sopenharmony_ci memcpy(&(mem->buf[mem->size]), contents, realsize); 7413498266Sopenharmony_ci mem->size += realsize; 7513498266Sopenharmony_ci return realsize; 7613498266Sopenharmony_ci} 7713498266Sopenharmony_ci 7813498266Sopenharmony_ciCURL *make_handle(char *url) 7913498266Sopenharmony_ci{ 8013498266Sopenharmony_ci CURL *handle = curl_easy_init(); 8113498266Sopenharmony_ci 8213498266Sopenharmony_ci /* Important: use HTTP2 over HTTPS */ 8313498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS); 8413498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_URL, url); 8513498266Sopenharmony_ci 8613498266Sopenharmony_ci /* buffer body */ 8713498266Sopenharmony_ci memory *mem = malloc(sizeof(memory)); 8813498266Sopenharmony_ci mem->size = 0; 8913498266Sopenharmony_ci mem->buf = malloc(1); 9013498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer); 9113498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem); 9213498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_PRIVATE, mem); 9313498266Sopenharmony_ci 9413498266Sopenharmony_ci /* For completeness */ 9513498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, ""); 9613498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L); 9713498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L); 9813498266Sopenharmony_ci /* only allow redirects to HTTP and HTTPS URLs */ 9913498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https"); 10013498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L); 10113498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L); 10213498266Sopenharmony_ci /* each transfer needs to be done within 20 seconds! */ 10313498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L); 10413498266Sopenharmony_ci /* connect fast or fail */ 10513498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L); 10613498266Sopenharmony_ci /* skip files larger than a gigabyte */ 10713498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE, 10813498266Sopenharmony_ci (curl_off_t)1024*1024*1024); 10913498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_COOKIEFILE, ""); 11013498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_FILETIME, 1L); 11113498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler"); 11213498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY); 11313498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L); 11413498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY); 11513498266Sopenharmony_ci curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L); 11613498266Sopenharmony_ci return handle; 11713498266Sopenharmony_ci} 11813498266Sopenharmony_ci 11913498266Sopenharmony_ci/* HREF finder implemented in libxml2 but could be any HTML parser */ 12013498266Sopenharmony_cisize_t follow_links(CURLM *multi_handle, memory *mem, char *url) 12113498266Sopenharmony_ci{ 12213498266Sopenharmony_ci int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \ 12313498266Sopenharmony_ci HTML_PARSE_NOWARNING | HTML_PARSE_NONET; 12413498266Sopenharmony_ci htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts); 12513498266Sopenharmony_ci if(!doc) 12613498266Sopenharmony_ci return 0; 12713498266Sopenharmony_ci xmlChar *xpath = (xmlChar*) "//a/@href"; 12813498266Sopenharmony_ci xmlXPathContextPtr context = xmlXPathNewContext(doc); 12913498266Sopenharmony_ci xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context); 13013498266Sopenharmony_ci xmlXPathFreeContext(context); 13113498266Sopenharmony_ci if(!result) 13213498266Sopenharmony_ci return 0; 13313498266Sopenharmony_ci xmlNodeSetPtr nodeset = result->nodesetval; 13413498266Sopenharmony_ci if(xmlXPathNodeSetIsEmpty(nodeset)) { 13513498266Sopenharmony_ci xmlXPathFreeObject(result); 13613498266Sopenharmony_ci return 0; 13713498266Sopenharmony_ci } 13813498266Sopenharmony_ci size_t count = 0; 13913498266Sopenharmony_ci int i; 14013498266Sopenharmony_ci for(i = 0; i < nodeset->nodeNr; i++) { 14113498266Sopenharmony_ci double r = rand(); 14213498266Sopenharmony_ci int x = r * nodeset->nodeNr / RAND_MAX; 14313498266Sopenharmony_ci const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode; 14413498266Sopenharmony_ci xmlChar *href = xmlNodeListGetString(doc, node, 1); 14513498266Sopenharmony_ci if(follow_relative_links) { 14613498266Sopenharmony_ci xmlChar *orig = href; 14713498266Sopenharmony_ci href = xmlBuildURI(href, (xmlChar *) url); 14813498266Sopenharmony_ci xmlFree(orig); 14913498266Sopenharmony_ci } 15013498266Sopenharmony_ci char *link = (char *) href; 15113498266Sopenharmony_ci if(!link || strlen(link) < 20) 15213498266Sopenharmony_ci continue; 15313498266Sopenharmony_ci if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) { 15413498266Sopenharmony_ci curl_multi_add_handle(multi_handle, make_handle(link)); 15513498266Sopenharmony_ci if(count++ == max_link_per_page) 15613498266Sopenharmony_ci break; 15713498266Sopenharmony_ci } 15813498266Sopenharmony_ci xmlFree(link); 15913498266Sopenharmony_ci } 16013498266Sopenharmony_ci xmlXPathFreeObject(result); 16113498266Sopenharmony_ci return count; 16213498266Sopenharmony_ci} 16313498266Sopenharmony_ci 16413498266Sopenharmony_ciint is_html(char *ctype) 16513498266Sopenharmony_ci{ 16613498266Sopenharmony_ci return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html"); 16713498266Sopenharmony_ci} 16813498266Sopenharmony_ci 16913498266Sopenharmony_ciint main(void) 17013498266Sopenharmony_ci{ 17113498266Sopenharmony_ci signal(SIGINT, sighandler); 17213498266Sopenharmony_ci LIBXML_TEST_VERSION; 17313498266Sopenharmony_ci curl_global_init(CURL_GLOBAL_DEFAULT); 17413498266Sopenharmony_ci CURLM *multi_handle = curl_multi_init(); 17513498266Sopenharmony_ci curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con); 17613498266Sopenharmony_ci curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L); 17713498266Sopenharmony_ci 17813498266Sopenharmony_ci /* enables http/2 if available */ 17913498266Sopenharmony_ci#ifdef CURLPIPE_MULTIPLEX 18013498266Sopenharmony_ci curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX); 18113498266Sopenharmony_ci#endif 18213498266Sopenharmony_ci 18313498266Sopenharmony_ci /* sets html start page */ 18413498266Sopenharmony_ci curl_multi_add_handle(multi_handle, make_handle(start_page)); 18513498266Sopenharmony_ci 18613498266Sopenharmony_ci int msgs_left; 18713498266Sopenharmony_ci int pending = 0; 18813498266Sopenharmony_ci int complete = 0; 18913498266Sopenharmony_ci int still_running = 1; 19013498266Sopenharmony_ci while(still_running && !pending_interrupt) { 19113498266Sopenharmony_ci int numfds; 19213498266Sopenharmony_ci curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); 19313498266Sopenharmony_ci curl_multi_perform(multi_handle, &still_running); 19413498266Sopenharmony_ci 19513498266Sopenharmony_ci /* See how the transfers went */ 19613498266Sopenharmony_ci CURLMsg *m = NULL; 19713498266Sopenharmony_ci while((m = curl_multi_info_read(multi_handle, &msgs_left))) { 19813498266Sopenharmony_ci if(m->msg == CURLMSG_DONE) { 19913498266Sopenharmony_ci CURL *handle = m->easy_handle; 20013498266Sopenharmony_ci char *url; 20113498266Sopenharmony_ci memory *mem; 20213498266Sopenharmony_ci curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem); 20313498266Sopenharmony_ci curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url); 20413498266Sopenharmony_ci if(m->data.result == CURLE_OK) { 20513498266Sopenharmony_ci long res_status; 20613498266Sopenharmony_ci curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status); 20713498266Sopenharmony_ci if(res_status == 200) { 20813498266Sopenharmony_ci char *ctype; 20913498266Sopenharmony_ci curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype); 21013498266Sopenharmony_ci printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url); 21113498266Sopenharmony_ci if(is_html(ctype) && mem->size > 100) { 21213498266Sopenharmony_ci if(pending < max_requests && (complete + pending) < max_total) { 21313498266Sopenharmony_ci pending += follow_links(multi_handle, mem, url); 21413498266Sopenharmony_ci still_running = 1; 21513498266Sopenharmony_ci } 21613498266Sopenharmony_ci } 21713498266Sopenharmony_ci } 21813498266Sopenharmony_ci else { 21913498266Sopenharmony_ci printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url); 22013498266Sopenharmony_ci } 22113498266Sopenharmony_ci } 22213498266Sopenharmony_ci else { 22313498266Sopenharmony_ci printf("[%d] Connection failure: %s\n", complete, url); 22413498266Sopenharmony_ci } 22513498266Sopenharmony_ci curl_multi_remove_handle(multi_handle, handle); 22613498266Sopenharmony_ci curl_easy_cleanup(handle); 22713498266Sopenharmony_ci free(mem->buf); 22813498266Sopenharmony_ci free(mem); 22913498266Sopenharmony_ci complete++; 23013498266Sopenharmony_ci pending--; 23113498266Sopenharmony_ci } 23213498266Sopenharmony_ci } 23313498266Sopenharmony_ci } 23413498266Sopenharmony_ci curl_multi_cleanup(multi_handle); 23513498266Sopenharmony_ci curl_global_cleanup(); 23613498266Sopenharmony_ci return 0; 23713498266Sopenharmony_ci} 238