xref: /third_party/toybox/toys/pending/wget.c (revision 0f66f451)
1/* wget.c - Simple downloader to get the resource file from a HTTP server
2 *
3 * Copyright 2016 Lipi C.H. Lee <lipisoft@gmail.com>
4 * Copyright 2021 Eric Molitor <eric@molitor.org>
5 *
6 * Relevant sources of information
7 * -------------------------------
8 * HTTP 1.1: https://www.rfc-editor.org/rfc/rfc7230
9 * Chunked Encoding: https://www.rfc-editor.org/rfc/rfc7230#section-4.1
 * UTF-8 Encoded Header Values: https://www.rfc-editor.org/rfc/rfc5987
11 *
12 * Test URLs
13 * ---------
14 * Chunked Encoding: https://jigsaw.w3.org/HTTP/ChunkedScript
15 * Redirect 301: https://jigsaw.w3.org/HTTP/300/301.html
16 * Redirect 302: https://jigsaw.w3.org/HTTP/300/302.html
17 * TLS 1.0: https://tls-v1-0.badssl.com:1010/
18 * TLS 1.1: https://tls-v1-1.badssl.com:1011/
19 * TLS 1.2: https://tls-v1-2.badssl.com:1012/
20 * TLS 1.3: https://tls13.1d.pw/
21 * Transfer Encoding [gzip|deflate]: https://jigsaw.w3.org/HTTP/TE/bar.txt
22 *
23 *
24 * todo: Add support for configurable TLS versions
25 * todo: Add support for ftp
26 * todo: Add support for Transfer Encoding (gzip|deflate)
27 * todo: Add support for RFC5987
28
29USE_WGET(NEWTOY(wget, "<1>1(max-redirect)#<0=20d(debug)O(output-document):", TOYFLAG_USR|TOYFLAG_BIN))
30
31config WGET
32  bool "wget"
33  default n
34  help
35    usage: wget [OPTIONS]... [URL]
36        --max-redirect          maximum redirections allowed
37    -d, --debug                 print lots of debugging information
38    -O, --output-document=FILE  specify output filename
39
40    examples:
41      wget http://www.example.com
42
43config WGET_LIBTLS
44  bool "Enable HTTPS support for wget via LibTLS"
45  default n
46  depends on WGET && !WGET_OPENSSL
47  help
48    Enable HTTPS support for wget by linking to LibTLS.
49    Supports using libtls, libretls or libtls-bearssl.
50
51config WGET_OPENSSL
52  bool "Enable HTTPS support for wget via OpenSSL"
53  default n
54  depends on WGET && !WGET_LIBTLS
55  help
56    Enable HTTPS support for wget by linking to OpenSSL.
57*/
58
59#define FOR_wget
60#include "toys.h"
61
62#if CFG_WGET_LIBTLS
63#define WGET_SSL 1
64#include <tls.h>
65#elif CFG_WGET_OPENSSL
66#define WGET_SSL 1
67#include <openssl/crypto.h>
68#include <openssl/ssl.h>
69#include <openssl/err.h>
70#else
71#define WGET_SSL 0
72#endif
73
// Header prefixes looked up (case-insensitively) in the HTTP response.
#define WGET_FILENAME         "Content-Disposition: attachment; filename="
#define WGET_CHUNKED          "transfer-encoding: chunked"
#define WGET_LOCATION         "Location: "
// Protocol list handed to tls_config_parse_protocols() in the libtls build.
#define WGET_LIBTLS_PROTOCOLS "tlsv1.2"

// Scheme predicates over TT.url. https:// is only recognized when the build
// has a TLS backend (WGET_SSL is nonzero).
#define WGET_IS_HTTP  (!strncmp(TT.url, "http://", 7))
#define WGET_IS_HTTPS (WGET_SSL && !strncmp(TT.url, "https://", 8))
81
// Per-command state, accessed through TT elsewhere in this file.
GLOBALS(
  // Output file name. wget_main picks one itself when FLAG(O) is not set;
  // presumably filled from -O by option parsing otherwise (TODO confirm).
  char *filename;
  // Remaining redirect budget; wget_main decrements it once per request and
  // gives up when it drops below zero.
  long redirects;

  // Connected socket fd. wget_close treats 0 as "nothing to close".
  int sock;
  // Heap copy of the URL being fetched; replaced on each redirect.
  char *url;
#if CFG_WGET_LIBTLS
  // libtls client context created in wget_connect.
  struct tls *tls;
#elif CFG_WGET_OPENSSL
  // OpenSSL context (SSL_CTX_new) and the connection object layered on sock.
  struct ssl_ctx_st *ctx;
  struct ssl_st *ssl;
#endif
)
95
// Case-insensitive search for needle in haystack. Returns a pointer to the
// byte just past the first match, or NULL when needle does not occur.
static char *wget_strncaseafter(char *haystack, char *needle)
{
  char *hit = strcasestr(haystack, needle);

  return hit ? hit + strlen(needle) : NULL;
}
102
103// get http info in URL
104static void wget_info(char *url, char **host, char **port, char **path)
105{
106  *host = strafter(url, "://");
107  *path = strchr(*host, '/');
108
109  if ((*path = strchr(*host, '/'))) {
110    **path = '\0';
111    *path = *path + 1;
112  } else {
113    *path = "";
114  }
115
116  if ( *host[0] == '[' && strchr(*host, ']') ) { // IPv6
117    *port = strafter(*host, "]:");
118    *host = *host + 1;
119    strchr(*host, ']')[0] = '\0';
120  } else { // IPv4
121    if ((*port = strchr(*host, ':'))) {
122      **port = '\0';
123      *port = *port + 1;
124    }
125  }
126
127  if (!*port && WGET_IS_HTTP) *port = "80";
128  else if (!*port && WGET_IS_HTTPS) *port = "443";
129  else if (!*port) error_exit("unsupported protocol");
130}
131
// Open a connection to host:port using the scheme of TT.url.
//
//   http://   plain TCP socket, stored in TT.sock
//   https://  TLS session through libtls (TT.tls) or OpenSSL (TT.ctx, TT.ssl
//             on top of TT.sock), depending on the build configuration
// Every failure is fatal and reported through error_exit().
static void wget_connect(char *host, char *port)
{
  if (WGET_IS_HTTP) {
    struct addrinfo *a =
        xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0);
    TT.sock = xconnectany(a);
  } else if (WGET_IS_HTTPS) {
#if CFG_WGET_LIBTLS
    struct tls_config *cfg;
    uint32_t protocols;

    // When a constructor fails there is no object to ask for an error
    // string, so tls_error()/tls_config_error() must not be called on NULL.
    if (!(TT.tls = tls_client())) error_exit("tls_client");
    if (!(cfg = tls_config_new())) error_exit("tls_config_new");
    if (tls_config_parse_protocols(&protocols, WGET_LIBTLS_PROTOCOLS) != 0)
      error_exit("tls_config_parse_protocols");
    if (tls_config_set_protocols(cfg, protocols) != 0)
      error_exit("tls_config_set_protocols: %s", tls_config_error(cfg));
    if (tls_configure(TT.tls, cfg) != 0)
      error_exit("tls_configure: %s", tls_error(TT.tls));
    tls_config_free(cfg);

    if (tls_connect(TT.tls, host, port) != 0)
      error_exit("tls_connect: %s", tls_error(TT.tls));
#elif CFG_WGET_OPENSSL
    struct addrinfo *a;

    SSL_library_init();
    OpenSSL_add_all_algorithms();
    SSL_load_error_strings();
    ERR_load_crypto_strings();

    TT.ctx = SSL_CTX_new(TLS_client_method());
    if (!TT.ctx) error_exit("SSL_CTX_new");

    a = xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0);
    TT.sock = xconnectany(a);

    TT.ssl = SSL_new(TT.ctx);
    if (!TT.ssl)
      error_exit("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL));

    // Send the host name in the ClientHello (SNI).
    if (!SSL_set_tlsext_host_name(TT.ssl, host))
      error_exit("SSL_set_tlsext_host_name: %s",
                 ERR_error_string(ERR_get_error(), NULL));

    // NOTE(review): no SSL_set_verify()/host name check is configured here,
    // so the peer certificate does not appear to be validated - confirm
    // whether that is intended.
    if (!SSL_set_fd(TT.ssl, TT.sock))
      error_exit("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL));

    // SSL_connect() returns 1 only on a completed handshake; 0 and negative
    // values are both failures.
    if (SSL_connect(TT.ssl) != 1)
      error_exit("SSL_connect: %s", ERR_error_string(ERR_get_error(), NULL));

    if (FLAG(d)) printf("TLS: %s\n", SSL_get_cipher(TT.ssl));
#endif
  } else error_exit("unsupported protocol");
}
185
186static size_t wget_read(void *buf, size_t len)
187{
188  if (WGET_IS_HTTP) return xread(TT.sock, buf, len);
189  else if (WGET_IS_HTTPS) {
190#if CFG_WGET_LIBTLS
191   ssize_t ret = tls_read(TT.tls, buf, len);
192   if (ret < 0) error_exit("tls_read: %s", tls_error(TT.tls));
193   return ret;
194#elif CFG_WGET_OPENSSL
195   int ret = SSL_read(TT.ssl, buf, (int) len);
196   if (ret < 0)
197     error_exit("SSL_read: %s", ERR_error_string(ERR_get_error(), NULL));
198   return ret;
199#endif
200  } else error_exit("unsupported protocol");
201}
202
203static void wget_write(void *buf, size_t len)
204{
205  if (WGET_IS_HTTP) {
206    xwrite(TT.sock, buf, len);
207  } else if (WGET_IS_HTTPS) {
208#if CFG_WGET_LIBTLS
209    if (len != tls_write(TT.tls, buf, len))
210      error_exit("tls_write: %s", tls_error(TT.tls));
211#elif CFG_WGET_OPENSSL
212    if (len != SSL_write(TT.ssl, buf, (int) len))
213      error_exit("SSL_write: %s", ERR_error_string(ERR_get_error(), NULL));
214#endif
215  } else error_exit("unsupported protocol");
216}
217
218static void wget_close()
219{
220  if (TT.sock) {
221      xclose(TT.sock);
222      TT.sock = 0;
223  }
224
225#if CFG_WGET_LIBTLS
226  if (TT.tls) {
227    tls_close(TT.tls);
228    tls_free(TT.tls);
229    TT.tls = NULL;
230  }
231#elif CFG_WGET_OPENSSL
232  if (TT.ssl) {
233    SSL_shutdown(TT.ssl);
234    SSL_free(TT.ssl);
235    TT.ssl = NULL;
236  }
237
238  if (TT.ctx) {
239    SSL_CTX_free(TT.ctx);
240    TT.ctx = NULL;
241  }
242#endif
243}
244
// Locate a header field in a NUL terminated block of response headers.
//
// header  the header block to search
// val     field prefix to look for, compared case-insensitively
//         (e.g. "Location: ")
// Returns a pointer just past the matched prefix, i.e. at the field value,
// or NULL when no header line starts with val.
//
// Only matches at the start of the block or directly after a newline count,
// so "Location: " is not found inside "Content-Location: ".
static char *wget_find_header(char *header, char *val)
{
  size_t skip = strlen(val);
  char *hit = header;

  while ((hit = strcasestr(hit, val))) {
    if (hit == header || hit[-1] == '\n') return hit + skip;
    hit++; // mid-line match: keep looking further on
  }

  return NULL;
}
249
// Report whether wget_find_header() locates val in header:
// 1 when it does, 0 when it does not.
static int wget_has_header(char *header, char *val)
{
  return wget_find_header(header, val) ? 1 : 0;
}
254
255static char *wget_redirect(char *header)
256{
257  char *redir = wget_find_header(header, WGET_LOCATION);
258  if (!redir) error_exit("could not parse redirect URL");
259  return xstrndup(redir, stridx(redir, '\r'));
260}
261
262static char *wget_filename(char *header, char *path)
263{
264  char *f = wget_find_header(header, WGET_FILENAME);
265  if (f) strchr(f, '\r')[0] = '\0';
266
267  if (!f && strchr(path, '/')) f = getbasename(path);
268  if (!f || !(*f) ) f = "index.html";
269
270  return f;
271}
272
273void wget_main(void)
274{
275  long status = 0;
276  size_t len, c_len = 0;
277  int fd, chunked;
278  char *body, *index, *host, *port, *path;
279  char agent[] = "toybox wget/" TOYBOX_VERSION;
280
281  TT.url = xstrdup(toys.optargs[0]);
282
283  for (;status != 200; TT.redirects--) {
284    if (TT.redirects < 0) error_exit("Too many redirects");
285
286    wget_info(TT.url, &host, &port, &path);
287
288    sprintf(toybuf, "GET /%s HTTP/1.1\r\nHost: %s\r\n"
289                    "User-Agent: %s\r\nConnection: close\r\n\r\n",
290                    path, host, agent);
291    if (FLAG(d)) printf("--- Request\n%s", toybuf);
292
293    wget_connect(host, port);
294    wget_write(toybuf, strlen(toybuf));
295
296    // Greedily read the HTTP response until either complete or toybuf is full
297    index = toybuf;
298    while ((len = wget_read(index, sizeof(toybuf) - (index - toybuf))) > 0)
299      index += len;
300
301    //Process the response such that
302    //  Valid ranges  toybuf[0...index)      valid length is (index - toybuf)
303    //  Header ranges toybuf[0...body)       header length strlen(toybuf)
304    //  Remnant Body  toybuf[body...index)   valid remnant body length is len
305    //
306    // Per RFC7230 the header cannot contain a NUL octet so we NUL terminate at
307    // the footer of the header. This allows for normal string functions to be
308    // used when processing the header.
309    body = memmem(toybuf, index - toybuf, "\r\n\r\n", 4);
310    if (!body) error_exit("response header too large");
311    body[0] = '\0'; // NUL terminate the headers
312    body += 4; // Skip to the head of body
313    len = index - body; // Adjust len to be body length
314    if (FLAG(d)) printf("--- Response\n%s\n\n", toybuf);
315
316    status = strtol(strafter(toybuf, " "), NULL, 10);
317    if ((status == 301) || (status == 302)) {
318      free(TT.url);
319      TT.url = wget_redirect(toybuf);
320      wget_close();
321    } else if (status != 200) error_exit("response: %ld", status);
322  }
323
324  if (!FLAG(O)) {
325    TT.filename = wget_filename(toybuf, path);
326    if (!access(TT.filename, F_OK))
327      error_exit("%s already exists", TT.filename);
328  }
329  fd = xcreate(TT.filename, (O_WRONLY|O_CREAT|O_TRUNC), 0644);
330
331  chunked = wget_has_header(toybuf, WGET_CHUNKED);
332
333  // If chunked we offset the first buffer by 2 character, meaning it is
334  // pointing at half of the header boundary, aka '\r\n'. This simplifies
335  // parsing of the first c_len length by allowing the do while loop to fall
336  // through on the first iteration and parse the first c_len size.
337  if (chunked) {
338    len = len + 2;
339    memmove(toybuf, body - 2, len);
340  } else {
341    memmove(toybuf, body, len);
342  }
343
344  // len is the size remaining in toybuf
345  // c_len is the size of the remaining bytes in the current chunk
346  do {
347    if (chunked) {
348      if (c_len > 0) { // We have an incomplete c_len to write
349        if (len <= c_len) { // Buffer is less than the c_len so full write
350          xwrite(fd, toybuf, len);
351          c_len = c_len - len;
352          len = 0;
353        } else { // Buffer is larger than the c_len so partial write
354          xwrite(fd, toybuf, c_len);
355          len = len - c_len;
356          memmove(toybuf, toybuf + c_len, len);
357          c_len = 0;
358        }
359      }
360
361      // If len is less than 2 we can't validate the chunk boundary so fall
362      // through and go read more into toybuf.
363      if ((c_len == 0) && (len > 2)) {
364        char *c;
365        if (strncmp(toybuf, "\r\n", 2) != 0) error_exit("chunk boundary");
366
367        // If we can't find the end of the new chunk signature fall through and
368        // read more into toybuf.
369        c = memmem(toybuf + 2, len - 2, "\r\n",2);
370        if (c) {
371          c_len = strtol(toybuf + 2, NULL, 16);
372          if (c_len == 0) goto exit; // A c_len of zero means we are complete
373          len = len - (c - toybuf) - 2;
374          memmove(toybuf, c + 2, len);
375        }
376      }
377
378      if (len == sizeof(toybuf)) error_exit("chunk overflow");
379    } else {
380      xwrite(fd, toybuf, len);
381      len = 0;
382    }
383  } while ((len += wget_read(toybuf + len, sizeof(toybuf) - len)) > 0);
384
385  exit:
386  wget_close();
387  free(TT.url);
388}
389