xref: /third_party/curl/docs/examples/crawler.c (revision 13498266)
1/***************************************************************************
2 *                                  _   _ ____  _
3 *  Project                     ___| | | |  _ \| |
4 *                             / __| | | | |_) | |
5 *                            | (__| |_| |  _ <| |___
6 *                             \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) Jeroen Ooms <jeroenooms@gmail.com>
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 * SPDX-License-Identifier: curl
22 *
23 * To compile:
24 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
25 *
26 */
27/* <DESC>
28 * Web crawler based on curl and libxml2 to stress-test curl with
29 * hundreds of concurrent connections to various servers.
30 * </DESC>
31 */
32
/*
 * Parameters
 *
 * Compile-time knobs for the crawl; edit and rebuild to change them.
 */

/* value handed to CURLMOPT_MAX_TOTAL_CONNECTIONS on the multi handle */
int max_con = 200;

/* links are no longer followed once completed + pending transfers reach
   this total */
int max_total = 20000;

/* links are only followed while fewer than this many transfers are pending */
int max_requests = 500;

/* bound on how many links follow_links() queues from a single page */
int max_link_per_page = 5;

/* non-zero: resolve each href against the URL of the page it was found on
   before deciding whether to follow it */
int follow_relative_links = 0;

/* the first URL that gets fetched */
char *start_page = "https://www.reuters.com";
40
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <limits.h>
#include <math.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
49
/*
 * Becomes 1 once SIGINT has been delivered; the transfer loop in main()
 * checks it on every iteration and stops when it is set.
 *
 * It is written from a signal handler, so it has to be a
 * volatile sig_atomic_t: that is the only kind of object the C standard
 * allows a handler to store to, and 'volatile' stops the compiler from
 * caching the value across loop iterations.
 */
volatile sig_atomic_t pending_interrupt = 0;

/*
 * Signal handler: records that an interrupt is pending and does nothing
 * else, so it stays async-signal-safe.
 *
 * dummy - the signal number, unused.
 */
void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}
55
/* resizable buffer holding the body bytes received so far */
typedef struct {
  char *buf;    /* heap block with 'size' valid bytes; not NUL-terminated */
  size_t size;  /* number of bytes stored in buf */
} memory;

/*
 * Write callback: appends the sz * nmemb bytes at 'contents' to the
 * 'memory' buffer passed in 'ctx'.
 *
 * contents - bytes delivered for this call
 * sz       - size of one element
 * nmemb    - number of elements
 * ctx      - the 'memory' to append to
 *
 * Returns the number of bytes stored. When data was delivered but could not
 * be stored (allocation failure or size overflow) it returns 0, which differs
 * from the delivered amount and therefore signals an error to the caller;
 * the buffer is left untouched in that case.
 */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  memory *mem = (memory *)ctx;
  size_t realsize;
  char *ptr;

  /* Nothing to append. Returning early also guarantees realloc() is never
     asked for zero bytes, which could free mem->buf behind our back. */
  if(sz == 0 || nmemb == 0)
    return 0;

  /* refuse sizes that would wrap around size_t */
  if(nmemb > (size_t)-1 / sz)
    return 0;
  realsize = sz * nmemb;
  if(realsize > (size_t)-1 - mem->size)
    return 0;

  ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* out of memory; mem->buf is still valid and still owned by mem */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(mem->buf + mem->size, contents, realsize);
  mem->size += realsize;
  return realsize;
}
77
78CURL *make_handle(char *url)
79{
80  CURL *handle = curl_easy_init();
81
82  /* Important: use HTTP2 over HTTPS */
83  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
84  curl_easy_setopt(handle, CURLOPT_URL, url);
85
86  /* buffer body */
87  memory *mem = malloc(sizeof(memory));
88  mem->size = 0;
89  mem->buf = malloc(1);
90  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
91  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
92  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
93
94  /* For completeness */
95  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
96  curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
97  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
98  /* only allow redirects to HTTP and HTTPS URLs */
99  curl_easy_setopt(handle, CURLOPT_REDIR_PROTOCOLS_STR, "http,https");
100  curl_easy_setopt(handle, CURLOPT_AUTOREFERER, 1L);
101  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
102  /* each transfer needs to be done within 20 seconds! */
103  curl_easy_setopt(handle, CURLOPT_TIMEOUT_MS, 20000L);
104  /* connect fast or fail */
105  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT_MS, 2000L);
106  /* skip files larger than a gigabyte */
107  curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE,
108                   (curl_off_t)1024*1024*1024);
109  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
110  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
111  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
112  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
113  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
114  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
115  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
116  return handle;
117}
118
119/* HREF finder implemented in libxml2 but could be any HTML parser */
120size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
121{
122  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
123             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
124  htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
125  if(!doc)
126    return 0;
127  xmlChar *xpath = (xmlChar*) "//a/@href";
128  xmlXPathContextPtr context = xmlXPathNewContext(doc);
129  xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
130  xmlXPathFreeContext(context);
131  if(!result)
132    return 0;
133  xmlNodeSetPtr nodeset = result->nodesetval;
134  if(xmlXPathNodeSetIsEmpty(nodeset)) {
135    xmlXPathFreeObject(result);
136    return 0;
137  }
138  size_t count = 0;
139  int i;
140  for(i = 0; i < nodeset->nodeNr; i++) {
141    double r = rand();
142    int x = r * nodeset->nodeNr / RAND_MAX;
143    const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
144    xmlChar *href = xmlNodeListGetString(doc, node, 1);
145    if(follow_relative_links) {
146      xmlChar *orig = href;
147      href = xmlBuildURI(href, (xmlChar *) url);
148      xmlFree(orig);
149    }
150    char *link = (char *) href;
151    if(!link || strlen(link) < 20)
152      continue;
153    if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
154      curl_multi_add_handle(multi_handle, make_handle(link));
155      if(count++ == max_link_per_page)
156        break;
157    }
158    xmlFree(link);
159  }
160  xmlXPathFreeObject(result);
161  return count;
162}
163
/*
 * Tells whether a Content-Type header value denotes an HTML document.
 *
 * ctype - header value, may be NULL
 *
 * Returns 1 if 'ctype' contains "text/html" (with or without parameters
 * such as "; charset=utf-8"), 0 otherwise. The match is case-sensitive.
 */
int is_html(const char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
168
169int main(void)
170{
171  signal(SIGINT, sighandler);
172  LIBXML_TEST_VERSION;
173  curl_global_init(CURL_GLOBAL_DEFAULT);
174  CURLM *multi_handle = curl_multi_init();
175  curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
176  curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
177
178  /* enables http/2 if available */
179#ifdef CURLPIPE_MULTIPLEX
180  curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
181#endif
182
183  /* sets html start page */
184  curl_multi_add_handle(multi_handle, make_handle(start_page));
185
186  int msgs_left;
187  int pending = 0;
188  int complete = 0;
189  int still_running = 1;
190  while(still_running && !pending_interrupt) {
191    int numfds;
192    curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
193    curl_multi_perform(multi_handle, &still_running);
194
195    /* See how the transfers went */
196    CURLMsg *m = NULL;
197    while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
198      if(m->msg == CURLMSG_DONE) {
199        CURL *handle = m->easy_handle;
200        char *url;
201        memory *mem;
202        curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
203        curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
204        if(m->data.result == CURLE_OK) {
205          long res_status;
206          curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
207          if(res_status == 200) {
208            char *ctype;
209            curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
210            printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
211            if(is_html(ctype) && mem->size > 100) {
212              if(pending < max_requests && (complete + pending) < max_total) {
213                pending += follow_links(multi_handle, mem, url);
214                still_running = 1;
215              }
216            }
217          }
218          else {
219            printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
220          }
221        }
222        else {
223          printf("[%d] Connection failure: %s\n", complete, url);
224        }
225        curl_multi_remove_handle(multi_handle, handle);
226        curl_easy_cleanup(handle);
227        free(mem->buf);
228        free(mem);
229        complete++;
230        pending--;
231      }
232    }
233  }
234  curl_multi_cleanup(multi_handle);
235  curl_global_cleanup();
236  return 0;
237}
238