/*
 * Gaycko Text mode web browser
 * Copyright (c) 2011 Jean-Yves Lamoureux <jylam@lnxscene.org>
 * All Rights Reserved
 *
 * This library is free software. It comes without any warranty, to
 * the extent permitted by applicable law. You can redistribute it
 * and/or modify it under the terms of the Do What The Fuck You Want
 * To Public License, Version 2, as published by Sam Hocevar. See
 * http://sam.zoy.org/wtfpl/COPYING for more details.
 */
---|
[4716] | 12 | #include "parse.h" |
---|
| 13 | |
---|
[4720] | 14 | gDOM *gaycko_parse(char *data, unsigned int size) { |
---|
[4716] | 15 | |
---|
| 16 | /* Clean up and repair bad HTML */ |
---|
| 17 | TidyDoc tdoc = tidyCreate(); |
---|
| 18 | TidyBuffer errbuf = {0}; |
---|
| 19 | |
---|
| 20 | TidyBuffer output = {0}; |
---|
| 21 | |
---|
| 22 | Bool ok; |
---|
| 23 | int rc = -1; |
---|
| 24 | |
---|
| 25 | ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes ); // Convert to XHTML |
---|
| 26 | if ( ok ) |
---|
| 27 | rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics |
---|
| 28 | if ( rc >= 0 ) |
---|
| 29 | rc = tidyParseString( tdoc, data ); // Parse the input |
---|
| 30 | if ( rc >= 0 ) |
---|
| 31 | rc = tidyCleanAndRepair( tdoc ); // Tidy it up! |
---|
| 32 | if ( rc > 1 ) // If error, force output. |
---|
| 33 | rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? rc : -1 ); |
---|
| 34 | if ( rc >= 0 ) |
---|
| 35 | rc = tidySaveBuffer( tdoc, &output ); // Pretty Print |
---|
| 36 | |
---|
| 37 | /* Actual parsing */ |
---|
| 38 | htmlDocPtr doc = htmlParseDoc((unsigned char*)output.bp, NULL); |
---|
[4723] | 39 | |
---|
| 40 | /* Release tidy document */ |
---|
[4716] | 41 | tidyRelease( tdoc ); |
---|
| 42 | |
---|
[4723] | 43 | /* Convert libxml2's tree to our own DOM */ |
---|
[4716] | 44 | gDOM *dom = gaycko_convert_dom(doc); |
---|
| 45 | |
---|
[4723] | 46 | /* Free libxml2 tree */ |
---|
| 47 | xmlFreeDoc(doc); |
---|
| 48 | |
---|
[4716] | 49 | return dom; |
---|
| 50 | } |
---|
| 51 | |
---|