1/* wget.c - Simple downloader to get the resource file from a HTTP server 2 * 3 * Copyright 2016 Lipi C.H. Lee <lipisoft@gmail.com> 4 * Copyright 2021 Eric Molitor <eric@molitor.org> 5 * 6 * Relevant sources of information 7 * ------------------------------- 8 * HTTP 1.1: https://www.rfc-editor.org/rfc/rfc7230 9 * Chunked Encoding: https://www.rfc-editor.org/rfc/rfc7230#section-4.1 10 * UTF-8 Encoded Header Values https://www.rfc-editor.org/rfc/rfc5987 11 * 12 * Test URLs 13 * --------- 14 * Chunked Encoding: https://jigsaw.w3.org/HTTP/ChunkedScript 15 * Redirect 301: https://jigsaw.w3.org/HTTP/300/301.html 16 * Redirect 302: https://jigsaw.w3.org/HTTP/300/302.html 17 * TLS 1.0: https://tls-v1-0.badssl.com:1010/ 18 * TLS 1.1: https://tls-v1-1.badssl.com:1011/ 19 * TLS 1.2: https://tls-v1-2.badssl.com:1012/ 20 * TLS 1.3: https://tls13.1d.pw/ 21 * Transfer Encoding [gzip|deflate]: https://jigsaw.w3.org/HTTP/TE/bar.txt 22 * 23 * 24 * todo: Add support for configurable TLS versions 25 * todo: Add support for ftp 26 * todo: Add support for Transfer Encoding (gzip|deflate) 27 * todo: Add support for RFC5987 28 29USE_WGET(NEWTOY(wget, "<1>1(max-redirect)#<0=20d(debug)O(output-document):", TOYFLAG_USR|TOYFLAG_BIN)) 30 31config WGET 32 bool "wget" 33 default n 34 help 35 usage: wget [OPTIONS]... [URL] 36 --max-redirect maximum redirections allowed 37 -d, --debug print lots of debugging information 38 -O, --output-document=FILE specify output filename 39 40 examples: 41 wget http://www.example.com 42 43config WGET_LIBTLS 44 bool "Enable HTTPS support for wget via LibTLS" 45 default n 46 depends on WGET && !WGET_OPENSSL 47 help 48 Enable HTTPS support for wget by linking to LibTLS. 49 Supports using libtls, libretls or libtls-bearssl. 50 51config WGET_OPENSSL 52 bool "Enable HTTPS support for wget via OpenSSL" 53 default n 54 depends on WGET && !WGET_LIBTLS 55 help 56 Enable HTTPS support for wget by linking to OpenSSL. 57*/ 58 59#define FOR_wget 60#include "toys.h" 61 62#if CFG_WGET_LIBTLS 63#define WGET_SSL 1 64#include <tls.h> 65#elif CFG_WGET_OPENSSL 66#define WGET_SSL 1 67#include <openssl/crypto.h> 68#include <openssl/ssl.h> 69#include <openssl/err.h> 70#else 71#define WGET_SSL 0 72#endif 73 74#define WGET_FILENAME "Content-Disposition: attachment; filename=" 75#define WGET_CHUNKED "transfer-encoding: chunked" 76#define WGET_LOCATION "Location: " 77#define WGET_LIBTLS_PROTOCOLS "tlsv1.2" 78 79#define WGET_IS_HTTP (strncmp(TT.url, "http://", 7) == 0) 80#define WGET_IS_HTTPS (WGET_SSL && (strncmp(TT.url, "https://", 8) == 0)) 81 82GLOBALS( 83 char *filename; 84 long redirects; 85 86 int sock; 87 char *url; 88#if CFG_WGET_LIBTLS 89 struct tls *tls; 90#elif CFG_WGET_OPENSSL 91 struct ssl_ctx_st *ctx; 92 struct ssl_st *ssl; 93#endif 94) 95 96static char *wget_strncaseafter(char *haystack, char *needle) 97{ 98 char *result = strcasestr(haystack, needle); 99 if (result) result = result + strlen(needle); 100 return result; 101} 102 103// get http info in URL 104static void wget_info(char *url, char **host, char **port, char **path) 105{ 106 *host = strafter(url, "://"); 107 *path = strchr(*host, '/'); 108 109 if ((*path = strchr(*host, '/'))) { 110 **path = '\0'; 111 *path = *path + 1; 112 } else { 113 *path = ""; 114 } 115 116 if ( *host[0] == '[' && strchr(*host, ']') ) { // IPv6 117 *port = strafter(*host, "]:"); 118 *host = *host + 1; 119 strchr(*host, ']')[0] = '\0'; 120 } else { // IPv4 121 if ((*port = strchr(*host, ':'))) { 122 **port = '\0'; 123 *port = *port + 1; 124 } 125 } 126 127 if (!*port && WGET_IS_HTTP) *port = "80"; 128 else if (!*port && WGET_IS_HTTPS) *port = "443"; 129 else if (!*port) error_exit("unsupported protocol"); 130} 131 132static void wget_connect(char *host, char *port) 133{ 134 if (WGET_IS_HTTP) { 135 struct addrinfo *a = 136 xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0); 137 TT.sock = xconnectany(a); 138 } else if (WGET_IS_HTTPS) { 139#if CFG_WGET_LIBTLS 140 struct tls_config *cfg = NULL; 141 uint32_t protocols; 142 if ((TT.tls = tls_client()) == NULL) 143 error_exit("tls_client: %s", tls_error(TT.tls)); 144 if ((cfg = tls_config_new()) == NULL) 145 error_exit("tls_config_new: %s", tls_config_error(cfg)); 146 if (tls_config_parse_protocols(&protocols, WGET_LIBTLS_PROTOCOLS) != 0) 147 error_exit("tls_config_parse_protocols"); 148 if (tls_config_set_protocols(cfg, protocols) != 0) 149 error_exit("tls_config_set_protocols: %s", tls_config_error(cfg)); 150 if (tls_configure(TT.tls, cfg) != 0) 151 error_exit("tls_configure: %s", tls_error(TT.tls)); 152 tls_config_free(cfg); 153 154 if (tls_connect(TT.tls, host, port) != 0) 155 error_exit("tls_connect: %s", tls_error(TT.tls)); 156#elif CFG_WGET_OPENSSL 157 SSL_library_init(); 158 OpenSSL_add_all_algorithms(); 159 SSL_load_error_strings(); 160 ERR_load_crypto_strings(); 161 162 TT.ctx = SSL_CTX_new(TLS_client_method()); 163 if (!TT.ctx) error_exit("SSL_CTX_new"); 164 165 struct addrinfo *a = 166 xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0); 167 TT.sock = xconnectany(a); 168 169 TT.ssl = SSL_new(TT.ctx); 170 if (!TT.ssl) 171 error_exit("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL)); 172 173 if (!SSL_set_tlsext_host_name(TT.ssl, host)) 174 error_exit("SSL_set_tlsext_host_name: %s", 175 ERR_error_string(ERR_get_error(), NULL)); 176 177 SSL_set_fd(TT.ssl, TT.sock); 178 if (SSL_connect(TT.ssl) == -1) 179 error_exit("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL)); 180 181 if (FLAG(d)) printf("TLS: %s\n", SSL_get_cipher(TT.ssl)); 182#endif 183 } else error_exit("unsupported protocol"); 184} 185 186static size_t wget_read(void *buf, size_t len) 187{ 188 if (WGET_IS_HTTP) return xread(TT.sock, buf, len); 189 else if (WGET_IS_HTTPS) { 190#if CFG_WGET_LIBTLS 191 ssize_t ret = tls_read(TT.tls, buf, len); 192 if (ret < 0) error_exit("tls_read: %s", tls_error(TT.tls)); 193 return ret; 194#elif CFG_WGET_OPENSSL 195 int ret = SSL_read(TT.ssl, buf, (int) len); 196 if (ret < 0) 197 error_exit("SSL_read: %s", ERR_error_string(ERR_get_error(), NULL)); 198 return ret; 199#endif 200 } else error_exit("unsupported protocol"); 201} 202 203static void wget_write(void *buf, size_t len) 204{ 205 if (WGET_IS_HTTP) { 206 xwrite(TT.sock, buf, len); 207 } else if (WGET_IS_HTTPS) { 208#if CFG_WGET_LIBTLS 209 if (len != tls_write(TT.tls, buf, len)) 210 error_exit("tls_write: %s", tls_error(TT.tls)); 211#elif CFG_WGET_OPENSSL 212 if (len != SSL_write(TT.ssl, buf, (int) len)) 213 error_exit("SSL_write: %s", ERR_error_string(ERR_get_error(), NULL)); 214#endif 215 } else error_exit("unsupported protocol"); 216} 217 218static void wget_close() 219{ 220 if (TT.sock) { 221 xclose(TT.sock); 222 TT.sock = 0; 223 } 224 225#if CFG_WGET_LIBTLS 226 if (TT.tls) { 227 tls_close(TT.tls); 228 tls_free(TT.tls); 229 TT.tls = NULL; 230 } 231#elif CFG_WGET_OPENSSL 232 if (TT.ssl) { 233 SSL_shutdown(TT.ssl); 234 SSL_free(TT.ssl); 235 TT.ssl = NULL; 236 } 237 238 if (TT.ctx) { 239 SSL_CTX_free(TT.ctx); 240 TT.ctx = NULL; 241 } 242#endif 243} 244 245static char* wget_find_header(char *header, char *val) { 246 char *v= wget_strncaseafter(header, val); 247 return v; 248} 249 250static int wget_has_header(char *header, char *val) 251{ 252 return wget_find_header(header, val) != NULL; 253} 254 255static char *wget_redirect(char *header) 256{ 257 char *redir = wget_find_header(header, WGET_LOCATION); 258 if (!redir) error_exit("could not parse redirect URL"); 259 return xstrndup(redir, stridx(redir, '\r')); 260} 261 262static char *wget_filename(char *header, char *path) 263{ 264 char *f = wget_find_header(header, WGET_FILENAME); 265 if (f) strchr(f, '\r')[0] = '\0'; 266 267 if (!f && strchr(path, '/')) f = getbasename(path); 268 if (!f || !(*f) ) f = "index.html"; 269 270 return f; 271} 272 273void wget_main(void) 274{ 275 long status = 0; 276 size_t len, c_len = 0; 277 int fd, chunked; 278 char *body, *index, *host, *port, *path; 279 char agent[] = "toybox wget/" TOYBOX_VERSION; 280 281 TT.url = xstrdup(toys.optargs[0]); 282 283 for (;status != 200; TT.redirects--) { 284 if (TT.redirects < 0) error_exit("Too many redirects"); 285 286 wget_info(TT.url, &host, &port, &path); 287 288 sprintf(toybuf, "GET /%s HTTP/1.1\r\nHost: %s\r\n" 289 "User-Agent: %s\r\nConnection: close\r\n\r\n", 290 path, host, agent); 291 if (FLAG(d)) printf("--- Request\n%s", toybuf); 292 293 wget_connect(host, port); 294 wget_write(toybuf, strlen(toybuf)); 295 296 // Greedily read the HTTP response until either complete or toybuf is full 297 index = toybuf; 298 while ((len = wget_read(index, sizeof(toybuf) - (index - toybuf))) > 0) 299 index += len; 300 301 //Process the response such that 302 // Valid ranges toybuf[0...index) valid length is (index - toybuf) 303 // Header ranges toybuf[0...body) header length strlen(toybuf) 304 // Remnant Body toybuf[body...index) valid remnant body length is len 305 // 306 // Per RFC7230 the header cannot contain a NUL octet so we NUL terminate at 307 // the footer of the header. This allows for normal string functions to be 308 // used when processing the header. 309 body = memmem(toybuf, index - toybuf, "\r\n\r\n", 4); 310 if (!body) error_exit("response header too large"); 311 body[0] = '\0'; // NUL terminate the headers 312 body += 4; // Skip to the head of body 313 len = index - body; // Adjust len to be body length 314 if (FLAG(d)) printf("--- Response\n%s\n\n", toybuf); 315 316 status = strtol(strafter(toybuf, " "), NULL, 10); 317 if ((status == 301) || (status == 302)) { 318 free(TT.url); 319 TT.url = wget_redirect(toybuf); 320 wget_close(); 321 } else if (status != 200) error_exit("response: %ld", status); 322 } 323 324 if (!FLAG(O)) { 325 TT.filename = wget_filename(toybuf, path); 326 if (!access(TT.filename, F_OK)) 327 error_exit("%s already exists", TT.filename); 328 } 329 fd = xcreate(TT.filename, (O_WRONLY|O_CREAT|O_TRUNC), 0644); 330 331 chunked = wget_has_header(toybuf, WGET_CHUNKED); 332 333 // If chunked we offset the first buffer by 2 character, meaning it is 334 // pointing at half of the header boundary, aka '\r\n'. This simplifies 335 // parsing of the first c_len length by allowing the do while loop to fall 336 // through on the first iteration and parse the first c_len size. 337 if (chunked) { 338 len = len + 2; 339 memmove(toybuf, body - 2, len); 340 } else { 341 memmove(toybuf, body, len); 342 } 343 344 // len is the size remaining in toybuf 345 // c_len is the size of the remaining bytes in the current chunk 346 do { 347 if (chunked) { 348 if (c_len > 0) { // We have an incomplete c_len to write 349 if (len <= c_len) { // Buffer is less than the c_len so full write 350 xwrite(fd, toybuf, len); 351 c_len = c_len - len; 352 len = 0; 353 } else { // Buffer is larger than the c_len so partial write 354 xwrite(fd, toybuf, c_len); 355 len = len - c_len; 356 memmove(toybuf, toybuf + c_len, len); 357 c_len = 0; 358 } 359 } 360 361 // If len is less than 2 we can't validate the chunk boundary so fall 362 // through and go read more into toybuf. 363 if ((c_len == 0) && (len > 2)) { 364 char *c; 365 if (strncmp(toybuf, "\r\n", 2) != 0) error_exit("chunk boundary"); 366 367 // If we can't find the end of the new chunk signature fall through and 368 // read more into toybuf. 369 c = memmem(toybuf + 2, len - 2, "\r\n",2); 370 if (c) { 371 c_len = strtol(toybuf + 2, NULL, 16); 372 if (c_len == 0) goto exit; // A c_len of zero means we are complete 373 len = len - (c - toybuf) - 2; 374 memmove(toybuf, c + 2, len); 375 } 376 } 377 378 if (len == sizeof(toybuf)) error_exit("chunk overflow"); 379 } else { 380 xwrite(fd, toybuf, len); 381 len = 0; 382 } 383 } while ((len += wget_read(toybuf + len, sizeof(toybuf) - len)) > 0); 384 385 exit: 386 wget_close(); 387 free(TT.url); 388} 389