/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
 *
 * This software is licensed as described in the file COPYING, which
 * you should have received as part of this distribution. The terms
 * are also available at https://curl.se/docs/copyright.html.
 *
 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
 * copies of the Software, and permit persons to whom the Software is
 * furnished to do so, under the terms of the COPYING file.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 * SPDX-License-Identifier: curl
 *
 ***************************************************************************/
/* <DESC>
 * Download a document and use libtidy to parse the HTML.
 * </DESC>
 */
/*
 * LibTidy => https://www.html-tidy.org/
 */
3113498266Sopenharmony_ci
3213498266Sopenharmony_ci#include <stdio.h>
3313498266Sopenharmony_ci#include <tidy/tidy.h>
3413498266Sopenharmony_ci#include <tidy/tidybuffio.h>
3513498266Sopenharmony_ci#include <curl/curl.h>
3613498266Sopenharmony_ci
3713498266Sopenharmony_ci/* curl write callback, to fill tidy's input buffer...  */
3813498266Sopenharmony_ciuint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out)
3913498266Sopenharmony_ci{
4013498266Sopenharmony_ci  uint r;
4113498266Sopenharmony_ci  r = size * nmemb;
4213498266Sopenharmony_ci  tidyBufAppend(out, in, r);
4313498266Sopenharmony_ci  return r;
4413498266Sopenharmony_ci}
4513498266Sopenharmony_ci
4613498266Sopenharmony_ci/* Traverse the document tree */
4713498266Sopenharmony_civoid dumpNode(TidyDoc doc, TidyNode tnod, int indent)
4813498266Sopenharmony_ci{
4913498266Sopenharmony_ci  TidyNode child;
5013498266Sopenharmony_ci  for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) {
5113498266Sopenharmony_ci    ctmbstr name = tidyNodeGetName(child);
5213498266Sopenharmony_ci    if(name) {
5313498266Sopenharmony_ci      /* if it has a name, then it's an HTML tag ... */
5413498266Sopenharmony_ci      TidyAttr attr;
5513498266Sopenharmony_ci      printf("%*.*s%s ", indent, indent, "<", name);
5613498266Sopenharmony_ci      /* walk the attribute list */
5713498266Sopenharmony_ci      for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) {
5813498266Sopenharmony_ci        printf("%s", tidyAttrName(attr));
5913498266Sopenharmony_ci        tidyAttrValue(attr)?printf("=\"%s\" ",
6013498266Sopenharmony_ci                                   tidyAttrValue(attr)):printf(" ");
6113498266Sopenharmony_ci      }
6213498266Sopenharmony_ci      printf(">\n");
6313498266Sopenharmony_ci    }
6413498266Sopenharmony_ci    else {
6513498266Sopenharmony_ci      /* if it does not have a name, then it's probably text, cdata, etc... */
6613498266Sopenharmony_ci      TidyBuffer buf;
6713498266Sopenharmony_ci      tidyBufInit(&buf);
6813498266Sopenharmony_ci      tidyNodeGetText(doc, child, &buf);
6913498266Sopenharmony_ci      printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:"");
7013498266Sopenharmony_ci      tidyBufFree(&buf);
7113498266Sopenharmony_ci    }
7213498266Sopenharmony_ci    dumpNode(doc, child, indent + 4); /* recursive */
7313498266Sopenharmony_ci  }
7413498266Sopenharmony_ci}
7513498266Sopenharmony_ci
7613498266Sopenharmony_ci
7713498266Sopenharmony_ciint main(int argc, char **argv)
7813498266Sopenharmony_ci{
7913498266Sopenharmony_ci  if(argc == 2) {
8013498266Sopenharmony_ci    CURL *curl;
8113498266Sopenharmony_ci    char curl_errbuf[CURL_ERROR_SIZE];
8213498266Sopenharmony_ci    TidyDoc tdoc;
8313498266Sopenharmony_ci    TidyBuffer docbuf = {0};
8413498266Sopenharmony_ci    TidyBuffer tidy_errbuf = {0};
8513498266Sopenharmony_ci    int err;
8613498266Sopenharmony_ci
8713498266Sopenharmony_ci    curl = curl_easy_init();
8813498266Sopenharmony_ci    curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
8913498266Sopenharmony_ci    curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf);
9013498266Sopenharmony_ci    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
9113498266Sopenharmony_ci    curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
9213498266Sopenharmony_ci    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
9313498266Sopenharmony_ci
9413498266Sopenharmony_ci    tdoc = tidyCreate();
9513498266Sopenharmony_ci    tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
9613498266Sopenharmony_ci    tidyOptSetInt(tdoc, TidyWrapLen, 4096);
9713498266Sopenharmony_ci    tidySetErrorBuffer(tdoc, &tidy_errbuf);
9813498266Sopenharmony_ci    tidyBufInit(&docbuf);
9913498266Sopenharmony_ci
10013498266Sopenharmony_ci    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf);
10113498266Sopenharmony_ci    err = curl_easy_perform(curl);
10213498266Sopenharmony_ci    if(!err) {
10313498266Sopenharmony_ci      err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */
10413498266Sopenharmony_ci      if(err >= 0) {
10513498266Sopenharmony_ci        err = tidyCleanAndRepair(tdoc); /* fix any problems */
10613498266Sopenharmony_ci        if(err >= 0) {
10713498266Sopenharmony_ci          err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
10813498266Sopenharmony_ci          if(err >= 0) {
10913498266Sopenharmony_ci            dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */
11013498266Sopenharmony_ci            fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */
11113498266Sopenharmony_ci          }
11213498266Sopenharmony_ci        }
11313498266Sopenharmony_ci      }
11413498266Sopenharmony_ci    }
11513498266Sopenharmony_ci    else
11613498266Sopenharmony_ci      fprintf(stderr, "%s\n", curl_errbuf);
11713498266Sopenharmony_ci
11813498266Sopenharmony_ci    /* clean-up */
11913498266Sopenharmony_ci    curl_easy_cleanup(curl);
12013498266Sopenharmony_ci    tidyBufFree(&docbuf);
12113498266Sopenharmony_ci    tidyBufFree(&tidy_errbuf);
12213498266Sopenharmony_ci    tidyRelease(tdoc);
12313498266Sopenharmony_ci    return err;
12413498266Sopenharmony_ci
12513498266Sopenharmony_ci  }
12613498266Sopenharmony_ci  else
12713498266Sopenharmony_ci    printf("usage: %s <url>\n", argv[0]);
12813498266Sopenharmony_ci
12913498266Sopenharmony_ci  return 0;
13013498266Sopenharmony_ci}
131