/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Jeroen Ooms <jeroenooms@gmail.com>
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */

/* Parameters */
int max_con = 200;
int max_total = 20000;
int max_requests = 500;
int max_link_per_page = 5;
int follow_relative_links = 0;
char *start_page = "https://www.reuters.com";

#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>

int pending_interrupt = 0;
void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}

/* resizable buffer that collects a response body in memory */
typedef struct {
  char *buf;
  size_t size;
} memory;

/* libcurl write callback: append the received chunk to the buffer */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  memory *mem = (memory*) ctx;
  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* out of memory */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}

CURL *make_handle(char *url)
{
  CURL *handle = curl_easy_init();

  /* Important: use HTTP/2 over HTTPS */
  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
  curl_easy_setopt(handle, CURLOPT_URL, url);

  /* buffer body */
  memory *mem = malloc(sizeof(memory));
  mem->size = 0;
  mem->buf = malloc(1);
  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);

  /* For completeness */
  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  /* only allow redirects to HTTP and HTTPS URLs */
  curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https");
  curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L);
  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
  /* each transfer needs to be done within 20 seconds! */
  curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L);
  /* connect fast or fail */
  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L);
  /* skip files larger than a gigabyte */
  curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE,
                   (curl_off_t)1024*1024*1024);
  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
  return handle;
}

/* HREF finder implemented in libxml2 but could be any HTML parser */
size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
{
  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR |
             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  htmlDocPtr doc = htmlReadMemory(mem->buf, (int)mem->size, url, NULL, opts);
  if(!doc)
    return 0;
  xmlChar *xpath = (xmlChar*) "//a/@href";
  xmlXPathContextPtr context = xmlXPathNewContext(doc);
  xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
  xmlXPathFreeContext(context);
  if(!result) {
    xmlFreeDoc(doc);
    return 0;
  }
  xmlNodeSetPtr nodeset = result->nodesetval;
  if(xmlXPathNodeSetIsEmpty(nodeset)) {
    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);
    return 0;
  }
  size_t count = 0;
  int i;
  for(i = 0; i < nodeset->nodeNr; i++) {
    /* pick a random href from the result set */
    double r = rand();
    int x = (int)(r * nodeset->nodeNr / ((double)RAND_MAX + 1));
    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
    xmlChar *href = xmlNodeListGetString(doc, node, 1);
    if(follow_relative_links) {
      xmlChar *orig = href;
      href = xmlBuildURI(href, (xmlChar *) url);
      xmlFree(orig);
    }
    char *link = (char *) href;
    if(!link || strlen(link) < 20) {
      if(href)
        xmlFree(href);
      continue;
    }
    if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
      curl_multi_add_handle(multi_handle, make_handle(link));
      if(count++ == max_link_per_page) {
        xmlFree(link);
        break;
      }
    }
    xmlFree(link);
  }
  xmlXPathFreeObject(result);
  xmlFreeDoc(doc);
  return count;
}

/* true when the Content-Type header mentions text/html */
int is_html(char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}

int main(void)
{
  signal(SIGINT, sighandler);
  LIBXML_TEST_VERSION;
  curl_global_init(CURL_GLOBAL_DEFAULT);
  CURLM *multi_handle = curl_multi_init();
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS,
                    (long)max_con);
  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

  /* enables http/2 if available */
#ifdef CURLPIPE_MULTIPLEX
  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif

  /* sets html start page */
  curl_multi_add_handle(multi_handle, make_handle(start_page));

  int msgs_left;
  int pending = 1;   /* the start page is already queued */
  int complete = 0;
  int still_running = 1;
  while(still_running && !pending_interrupt) {
    int numfds;
    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    curl_multi_perform(multi_handle, &still_running);

    /* See how the transfers went */
    CURLMsg *m = NULL;
    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
      if(m->msg == CURLMSG_DONE) {
        CURL *handle = m->easy_handle;
        char *url;
        memory *mem;
        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
        if(m->data.result == CURLE_OK) {
          long res_status;
          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
          if(res_status == 200) {
            char *ctype;
            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
            printf("[%d] HTTP 200 (%s): %s\n", complete,
                   ctype ? ctype : "unknown content type", url);
            if(is_html(ctype) && mem->size > 100) {
              if(pending < max_requests && (complete + pending) < max_total) {
                pending += (int) follow_links(multi_handle, mem, url);
                still_running = 1;
              }
            }
          }
          else {
            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
          }
        }
        else {
          printf("[%d] Connection failure: %s\n", complete, url);
        }
        curl_multi_remove_handle(multi_handle, handle);
        curl_easy_cleanup(handle);
        free(mem->buf);
        free(mem);
        complete++;
        pending--;
      }
    }
  }
  curl_multi_cleanup(multi_handle);
  curl_global_cleanup();
  return 0;
}
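
/*
 * Usage sketch: build with the pkg-config command shown in the header comment
 * and run the binary with no arguments; the parameters at the top of the file
 * are compile-time settings. Starting from start_page, each HTML response is
 * parsed and a few random absolute http(s) links are queued, as long as fewer
 * than max_requests transfers are pending and complete + pending stays below
 * max_total. Press Ctrl-C to stop: the SIGINT handler sets pending_interrupt
 * so the main loop exits and the multi handle is cleaned up.
 */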