/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Jeroen Ooms <jeroenooms@gmail.com>
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
/* <DESC>
 * Web crawler based on curl and libxml2 to stress-test curl with
 * hundreds of concurrent connections to various servers.
 * </DESC>
 */
3213498266Sopenharmony_ci
/* Tunable crawl parameters */

/* value handed to CURLMOPT_MAX_TOTAL_CONNECTIONS on the multi handle */
int max_con = 200;
/* no new links are queued once completed + pending reaches this */
int max_total = 20000;
/* no new links are queued while this many transfers are pending */
int max_requests = 500;
/* per-page link limit consulted by follow_links() */
int max_link_per_page = 5;
/* non-zero: resolve each href against the page URL before filtering */
int follow_relative_links = 0;
/* first URL handed to the multi handle */
char *start_page = "https://www.reuters.com";
4013498266Sopenharmony_ci
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>
4913498266Sopenharmony_ci
/*
 * Interrupt flag: set to 1 by sighandler() and polled by the transfer loop
 * in main(). It is declared volatile sig_atomic_t because that is the only
 * object type ISO C guarantees may be written from an asynchronous signal
 * handler and then read reliably by the interrupted code.
 */
volatile sig_atomic_t pending_interrupt = 0;

/*
 * Signal handler (installed for SIGINT by main()). It only records that an
 * interrupt was requested; all real work happens outside signal context.
 *
 * dummy: signal number, unused.
 */
void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}
5513498266Sopenharmony_ci
/*
 * Resizable buffer that accumulates one response body.
 *   buf  - heap storage, grown with realloc(); may be NULL while empty
 *   size - number of payload bytes currently stored in buf
 */
typedef struct {
  char *buf;
  size_t size;
} memory;

/*
 * Write callback with the CURLOPT_WRITEFUNCTION signature: appends the
 * sz * nmemb bytes at `contents` to the `memory` buffer passed as `ctx`.
 *
 * The buffer is always allocated one byte larger than the payload and kept
 * NUL-terminated after every successful call. Besides being convenient, the
 * extra byte guarantees realloc() is never asked for zero bytes: a
 * zero-length write on an empty buffer would otherwise call realloc(p, 0),
 * which may free `p` and return NULL, leaving mem->buf dangling.
 *
 * Returns the number of bytes consumed (sz * nmemb), or 0 when the buffer
 * could not be grown; in that case the existing buffer is left untouched.
 */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  memory *mem = (memory *)ctx;
  char *ptr = realloc(mem->buf, mem->size + realsize + 1);
  if(!ptr) {
    /* out of memory */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  /* memcpy() must not be handed a possibly-NULL source, even for 0 bytes */
  if(realsize)
    memcpy(mem->buf + mem->size, contents, realsize);
  mem->size += realsize;
  mem->buf[mem->size] = '\0';
  return realsize;
}
7713498266Sopenharmony_ci
7813498266Sopenharmony_ciCURL *make_handle(char *url)
7913498266Sopenharmony_ci{
8013498266Sopenharmony_ci  CURL *handle = curl_easy_init();
8113498266Sopenharmony_ci
8213498266Sopenharmony_ci  /* Important: use HTTP2 over HTTPS */
8313498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
8413498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_URL, url);
8513498266Sopenharmony_ci
8613498266Sopenharmony_ci  /* buffer body */
8713498266Sopenharmony_ci  memory *mem = malloc(sizeof(memory));
8813498266Sopenharmony_ci  mem->size = 0;
8913498266Sopenharmony_ci  mem->buf = malloc(1);
9013498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
9113498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
9213498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
9313498266Sopenharmony_ci
9413498266Sopenharmony_ci  /* For completeness */
9513498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
9613498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
9713498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
9813498266Sopenharmony_ci  /* only allow redirects to HTTP and HTTPS URLs */
9913498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https");
10013498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L);
10113498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
10213498266Sopenharmony_ci  /* each transfer needs to be done within 20 seconds! */
10313498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L);
10413498266Sopenharmony_ci  /* connect fast or fail */
10513498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L);
10613498266Sopenharmony_ci  /* skip files larger than a gigabyte */
10713498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE,
10813498266Sopenharmony_ci                   (curl_off_t)1024*1024*1024);
10913498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
11013498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
11113498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
11213498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
11313498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
11413498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
11513498266Sopenharmony_ci  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
11613498266Sopenharmony_ci  return handle;
11713498266Sopenharmony_ci}
11813498266Sopenharmony_ci
11913498266Sopenharmony_ci/* HREF finder implemented in libxml2 but could be any HTML parser */
12013498266Sopenharmony_cisize_t follow_links(CURLM *multi_handle, memory *mem, char *url)
12113498266Sopenharmony_ci{
12213498266Sopenharmony_ci  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
12313498266Sopenharmony_ci             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
12413498266Sopenharmony_ci  htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
12513498266Sopenharmony_ci  if(!doc)
12613498266Sopenharmony_ci    return 0;
12713498266Sopenharmony_ci  xmlChar *xpath = (xmlChar*) "//a/@href";
12813498266Sopenharmony_ci  xmlXPathContextPtr context = xmlXPathNewContext(doc);
12913498266Sopenharmony_ci  xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
13013498266Sopenharmony_ci  xmlXPathFreeContext(context);
13113498266Sopenharmony_ci  if(!result)
13213498266Sopenharmony_ci    return 0;
13313498266Sopenharmony_ci  xmlNodeSetPtr nodeset = result->nodesetval;
13413498266Sopenharmony_ci  if(xmlXPathNodeSetIsEmpty(nodeset)) {
13513498266Sopenharmony_ci    xmlXPathFreeObject(result);
13613498266Sopenharmony_ci    return 0;
13713498266Sopenharmony_ci  }
13813498266Sopenharmony_ci  size_t count = 0;
13913498266Sopenharmony_ci  int i;
14013498266Sopenharmony_ci  for(i = 0; i < nodeset->nodeNr; i++) {
14113498266Sopenharmony_ci    double r = rand();
14213498266Sopenharmony_ci    int x = r * nodeset->nodeNr / RAND_MAX;
14313498266Sopenharmony_ci    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
14413498266Sopenharmony_ci    xmlChar *href = xmlNodeListGetString(doc, node, 1);
14513498266Sopenharmony_ci    if(follow_relative_links) {
14613498266Sopenharmony_ci      xmlChar *orig = href;
14713498266Sopenharmony_ci      href = xmlBuildURI(href, (xmlChar *) url);
14813498266Sopenharmony_ci      xmlFree(orig);
14913498266Sopenharmony_ci    }
15013498266Sopenharmony_ci    char *link = (char *) href;
15113498266Sopenharmony_ci    if(!link || strlen(link) < 20)
15213498266Sopenharmony_ci      continue;
15313498266Sopenharmony_ci    if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
15413498266Sopenharmony_ci      curl_multi_add_handle(multi_handle, make_handle(link));
15513498266Sopenharmony_ci      if(count++ == max_link_per_page)
15613498266Sopenharmony_ci        break;
15713498266Sopenharmony_ci    }
15813498266Sopenharmony_ci    xmlFree(link);
15913498266Sopenharmony_ci  }
16013498266Sopenharmony_ci  xmlXPathFreeObject(result);
16113498266Sopenharmony_ci  return count;
16213498266Sopenharmony_ci}
16313498266Sopenharmony_ci
/*
 * Tell whether a Content-Type header value denotes an HTML document.
 *
 * ctype: Content-Type string, or NULL when the header is absent.
 *
 * Returns 1 when `ctype` contains the substring "text/html" (so values with
 * parameters such as "text/html; charset=utf-8" match, and so does the bare
 * "text/html"), 0 otherwise or when `ctype` is NULL. The comparison is
 * case-sensitive.
 */
int is_html(const char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
16813498266Sopenharmony_ci
16913498266Sopenharmony_ciint main(void)
17013498266Sopenharmony_ci{
17113498266Sopenharmony_ci  signal(SIGINT, sighandler);
17213498266Sopenharmony_ci  LIBXML_TEST_VERSION;
17313498266Sopenharmony_ci  curl_global_init(CURL_GLOBAL_DEFAULT);
17413498266Sopenharmony_ci  CURLM *multi_handle = curl_multi_init();
17513498266Sopenharmony_ci  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
17613498266Sopenharmony_ci  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
17713498266Sopenharmony_ci
17813498266Sopenharmony_ci  /* enables http/2 if available */
17913498266Sopenharmony_ci#ifdef CURLPIPE_MULTIPLEX
18013498266Sopenharmony_ci  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
18113498266Sopenharmony_ci#endif
18213498266Sopenharmony_ci
18313498266Sopenharmony_ci  /* sets html start page */
18413498266Sopenharmony_ci  curl_multi_add_handle(multi_handle, make_handle(start_page));
18513498266Sopenharmony_ci
18613498266Sopenharmony_ci  int msgs_left;
18713498266Sopenharmony_ci  int pending = 0;
18813498266Sopenharmony_ci  int complete = 0;
18913498266Sopenharmony_ci  int still_running = 1;
19013498266Sopenharmony_ci  while(still_running && !pending_interrupt) {
19113498266Sopenharmony_ci    int numfds;
19213498266Sopenharmony_ci    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
19313498266Sopenharmony_ci    curl_multi_perform(multi_handle, &still_running);
19413498266Sopenharmony_ci
19513498266Sopenharmony_ci    /* See how the transfers went */
19613498266Sopenharmony_ci    CURLMsg *m = NULL;
19713498266Sopenharmony_ci    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
19813498266Sopenharmony_ci      if(m->msg == CURLMSG_DONE) {
19913498266Sopenharmony_ci        CURL *handle = m->easy_handle;
20013498266Sopenharmony_ci        char *url;
20113498266Sopenharmony_ci        memory *mem;
20213498266Sopenharmony_ci        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
20313498266Sopenharmony_ci        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
20413498266Sopenharmony_ci        if(m->data.result == CURLE_OK) {
20513498266Sopenharmony_ci          long res_status;
20613498266Sopenharmony_ci          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
20713498266Sopenharmony_ci          if(res_status == 200) {
20813498266Sopenharmony_ci            char *ctype;
20913498266Sopenharmony_ci            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
21013498266Sopenharmony_ci            printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
21113498266Sopenharmony_ci            if(is_html(ctype) && mem->size > 100) {
21213498266Sopenharmony_ci              if(pending < max_requests && (complete + pending) < max_total) {
21313498266Sopenharmony_ci                pending += follow_links(multi_handle, mem, url);
21413498266Sopenharmony_ci                still_running = 1;
21513498266Sopenharmony_ci              }
21613498266Sopenharmony_ci            }
21713498266Sopenharmony_ci          }
21813498266Sopenharmony_ci          else {
21913498266Sopenharmony_ci            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
22013498266Sopenharmony_ci          }
22113498266Sopenharmony_ci        }
22213498266Sopenharmony_ci        else {
22313498266Sopenharmony_ci          printf("[%d] Connection failure: %s\n", complete, url);
22413498266Sopenharmony_ci        }
22513498266Sopenharmony_ci        curl_multi_remove_handle(multi_handle, handle);
22613498266Sopenharmony_ci        curl_easy_cleanup(handle);
22713498266Sopenharmony_ci        free(mem->buf);
22813498266Sopenharmony_ci        free(mem);
22913498266Sopenharmony_ci        complete++;
23013498266Sopenharmony_ci        pending--;
23113498266Sopenharmony_ci      }
23213498266Sopenharmony_ci    }
23313498266Sopenharmony_ci  }
23413498266Sopenharmony_ci  curl_multi_cleanup(multi_handle);
23513498266Sopenharmony_ci  curl_global_cleanup();
23613498266Sopenharmony_ci  return 0;
23713498266Sopenharmony_ci}
238