113498266Sopenharmony_ci/*************************************************************************** 213498266Sopenharmony_ci * _ _ ____ _ 313498266Sopenharmony_ci * Project ___| | | | _ \| | 413498266Sopenharmony_ci * / __| | | | |_) | | 513498266Sopenharmony_ci * | (__| |_| | _ <| |___ 613498266Sopenharmony_ci * \___|\___/|_| \_\_____| 713498266Sopenharmony_ci * 813498266Sopenharmony_ci * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al. 913498266Sopenharmony_ci * 1013498266Sopenharmony_ci * This software is licensed as described in the file COPYING, which 1113498266Sopenharmony_ci * you should have received as part of this distribution. The terms 1213498266Sopenharmony_ci * are also available at https://curl.se/docs/copyright.html. 1313498266Sopenharmony_ci * 1413498266Sopenharmony_ci * You may opt to use, copy, modify, merge, publish, distribute and/or sell 1513498266Sopenharmony_ci * copies of the Software, and permit persons to whom the Software is 1613498266Sopenharmony_ci * furnished to do so, under the terms of the COPYING file. 1713498266Sopenharmony_ci * 1813498266Sopenharmony_ci * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY 1913498266Sopenharmony_ci * KIND, either express or implied. 2013498266Sopenharmony_ci * 2113498266Sopenharmony_ci * SPDX-License-Identifier: curl 2213498266Sopenharmony_ci * 2313498266Sopenharmony_ci ***************************************************************************/ 2413498266Sopenharmony_ci/* <DESC> 2513498266Sopenharmony_ci * Download a document and use libtidy to parse the HTML. 2613498266Sopenharmony_ci * </DESC> 2713498266Sopenharmony_ci */ 2813498266Sopenharmony_ci/* 2913498266Sopenharmony_ci * LibTidy => https://www.html-tidy.org/ 3013498266Sopenharmony_ci */ 3113498266Sopenharmony_ci 3213498266Sopenharmony_ci#include <stdio.h> 3313498266Sopenharmony_ci#include <tidy/tidy.h> 3413498266Sopenharmony_ci#include <tidy/tidybuffio.h> 3513498266Sopenharmony_ci#include <curl/curl.h> 3613498266Sopenharmony_ci 3713498266Sopenharmony_ci/* curl write callback, to fill tidy's input buffer... */ 3813498266Sopenharmony_ciuint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out) 3913498266Sopenharmony_ci{ 4013498266Sopenharmony_ci uint r; 4113498266Sopenharmony_ci r = size * nmemb; 4213498266Sopenharmony_ci tidyBufAppend(out, in, r); 4313498266Sopenharmony_ci return r; 4413498266Sopenharmony_ci} 4513498266Sopenharmony_ci 4613498266Sopenharmony_ci/* Traverse the document tree */ 4713498266Sopenharmony_civoid dumpNode(TidyDoc doc, TidyNode tnod, int indent) 4813498266Sopenharmony_ci{ 4913498266Sopenharmony_ci TidyNode child; 5013498266Sopenharmony_ci for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) { 5113498266Sopenharmony_ci ctmbstr name = tidyNodeGetName(child); 5213498266Sopenharmony_ci if(name) { 5313498266Sopenharmony_ci /* if it has a name, then it's an HTML tag ... */ 5413498266Sopenharmony_ci TidyAttr attr; 5513498266Sopenharmony_ci printf("%*.*s%s ", indent, indent, "<", name); 5613498266Sopenharmony_ci /* walk the attribute list */ 5713498266Sopenharmony_ci for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) { 5813498266Sopenharmony_ci printf("%s", tidyAttrName(attr)); 5913498266Sopenharmony_ci tidyAttrValue(attr)?printf("=\"%s\" ", 6013498266Sopenharmony_ci tidyAttrValue(attr)):printf(" "); 6113498266Sopenharmony_ci } 6213498266Sopenharmony_ci printf(">\n"); 6313498266Sopenharmony_ci } 6413498266Sopenharmony_ci else { 6513498266Sopenharmony_ci /* if it does not have a name, then it's probably text, cdata, etc... */ 6613498266Sopenharmony_ci TidyBuffer buf; 6713498266Sopenharmony_ci tidyBufInit(&buf); 6813498266Sopenharmony_ci tidyNodeGetText(doc, child, &buf); 6913498266Sopenharmony_ci printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:""); 7013498266Sopenharmony_ci tidyBufFree(&buf); 7113498266Sopenharmony_ci } 7213498266Sopenharmony_ci dumpNode(doc, child, indent + 4); /* recursive */ 7313498266Sopenharmony_ci } 7413498266Sopenharmony_ci} 7513498266Sopenharmony_ci 7613498266Sopenharmony_ci 7713498266Sopenharmony_ciint main(int argc, char **argv) 7813498266Sopenharmony_ci{ 7913498266Sopenharmony_ci if(argc == 2) { 8013498266Sopenharmony_ci CURL *curl; 8113498266Sopenharmony_ci char curl_errbuf[CURL_ERROR_SIZE]; 8213498266Sopenharmony_ci TidyDoc tdoc; 8313498266Sopenharmony_ci TidyBuffer docbuf = {0}; 8413498266Sopenharmony_ci TidyBuffer tidy_errbuf = {0}; 8513498266Sopenharmony_ci int err; 8613498266Sopenharmony_ci 8713498266Sopenharmony_ci curl = curl_easy_init(); 8813498266Sopenharmony_ci curl_easy_setopt(curl, CURLOPT_URL, argv[1]); 8913498266Sopenharmony_ci curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); 9013498266Sopenharmony_ci curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); 9113498266Sopenharmony_ci curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); 9213498266Sopenharmony_ci curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); 9313498266Sopenharmony_ci 9413498266Sopenharmony_ci tdoc = tidyCreate(); 9513498266Sopenharmony_ci tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ 9613498266Sopenharmony_ci tidyOptSetInt(tdoc, TidyWrapLen, 4096); 9713498266Sopenharmony_ci tidySetErrorBuffer(tdoc, &tidy_errbuf); 9813498266Sopenharmony_ci tidyBufInit(&docbuf); 9913498266Sopenharmony_ci 10013498266Sopenharmony_ci curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); 10113498266Sopenharmony_ci err = curl_easy_perform(curl); 10213498266Sopenharmony_ci if(!err) { 10313498266Sopenharmony_ci err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ 10413498266Sopenharmony_ci if(err >= 0) { 10513498266Sopenharmony_ci err = tidyCleanAndRepair(tdoc); /* fix any problems */ 10613498266Sopenharmony_ci if(err >= 0) { 10713498266Sopenharmony_ci err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ 10813498266Sopenharmony_ci if(err >= 0) { 10913498266Sopenharmony_ci dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */ 11013498266Sopenharmony_ci fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */ 11113498266Sopenharmony_ci } 11213498266Sopenharmony_ci } 11313498266Sopenharmony_ci } 11413498266Sopenharmony_ci } 11513498266Sopenharmony_ci else 11613498266Sopenharmony_ci fprintf(stderr, "%s\n", curl_errbuf); 11713498266Sopenharmony_ci 11813498266Sopenharmony_ci /* clean-up */ 11913498266Sopenharmony_ci curl_easy_cleanup(curl); 12013498266Sopenharmony_ci tidyBufFree(&docbuf); 12113498266Sopenharmony_ci tidyBufFree(&tidy_errbuf); 12213498266Sopenharmony_ci tidyRelease(tdoc); 12313498266Sopenharmony_ci return err; 12413498266Sopenharmony_ci 12513498266Sopenharmony_ci } 12613498266Sopenharmony_ci else 12713498266Sopenharmony_ci printf("usage: %s <url>\n", argv[0]); 12813498266Sopenharmony_ci 12913498266Sopenharmony_ci return 0; 13013498266Sopenharmony_ci} 131