1/*************************************************************************** 2 * _ _ ____ _ 3 * Project ___| | | | _ \| | 4 * / __| | | | |_) | | 5 * | (__| |_| | _ <| |___ 6 * \___|\___/|_| \_\_____| 7 * 8 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al. 9 * 10 * This software is licensed as described in the file COPYING, which 11 * you should have received as part of this distribution. The terms 12 * are also available at https://curl.se/docs/copyright.html. 13 * 14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell 15 * copies of the Software, and permit persons to whom the Software is 16 * furnished to do so, under the terms of the COPYING file. 17 * 18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 19 * KIND, either express or implied. 20 * 21 * SPDX-License-Identifier: curl 22 * 23 ***************************************************************************/ 24/* <DESC> 25 * Download a document and use libtidy to parse the HTML. 26 * </DESC> 27 */ 28/* 29 * LibTidy => https://www.html-tidy.org/ 30 */ 31 32#include <stdio.h> 33#include <tidy/tidy.h> 34#include <tidy/tidybuffio.h> 35#include <curl/curl.h> 36 37/* curl write callback, to fill tidy's input buffer... */ 38uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out) 39{ 40 uint r; 41 r = size * nmemb; 42 tidyBufAppend(out, in, r); 43 return r; 44} 45 46/* Traverse the document tree */ 47void dumpNode(TidyDoc doc, TidyNode tnod, int indent) 48{ 49 TidyNode child; 50 for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) { 51 ctmbstr name = tidyNodeGetName(child); 52 if(name) { 53 /* if it has a name, then it's an HTML tag ... */ 54 TidyAttr attr; 55 printf("%*.*s%s ", indent, indent, "<", name); 56 /* walk the attribute list */ 57 for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) { 58 printf("%s", tidyAttrName(attr)); 59 tidyAttrValue(attr)?printf("=\"%s\" ", 60 tidyAttrValue(attr)):printf(" "); 61 } 62 printf(">\n"); 63 } 64 else { 65 /* if it does not have a name, then it's probably text, cdata, etc... */ 66 TidyBuffer buf; 67 tidyBufInit(&buf); 68 tidyNodeGetText(doc, child, &buf); 69 printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:""); 70 tidyBufFree(&buf); 71 } 72 dumpNode(doc, child, indent + 4); /* recursive */ 73 } 74} 75 76 77int main(int argc, char **argv) 78{ 79 if(argc == 2) { 80 CURL *curl; 81 char curl_errbuf[CURL_ERROR_SIZE]; 82 TidyDoc tdoc; 83 TidyBuffer docbuf = {0}; 84 TidyBuffer tidy_errbuf = {0}; 85 int err; 86 87 curl = curl_easy_init(); 88 curl_easy_setopt(curl, CURLOPT_URL, argv[1]); 89 curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); 90 curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); 91 curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); 92 curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); 93 94 tdoc = tidyCreate(); 95 tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ 96 tidyOptSetInt(tdoc, TidyWrapLen, 4096); 97 tidySetErrorBuffer(tdoc, &tidy_errbuf); 98 tidyBufInit(&docbuf); 99 100 curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); 101 err = curl_easy_perform(curl); 102 if(!err) { 103 err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ 104 if(err >= 0) { 105 err = tidyCleanAndRepair(tdoc); /* fix any problems */ 106 if(err >= 0) { 107 err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ 108 if(err >= 0) { 109 dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */ 110 fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */ 111 } 112 } 113 } 114 } 115 else 116 fprintf(stderr, "%s\n", curl_errbuf); 117 118 /* clean-up */ 119 curl_easy_cleanup(curl); 120 tidyBufFree(&docbuf); 121 tidyBufFree(&tidy_errbuf); 122 tidyRelease(tdoc); 123 return err; 124 125 } 126 else 127 printf("usage: %s <url>\n", argv[0]); 128 129 return 0; 130} 131