1 | #include "dom.h" |
---|
2 | #include <string.h> |
---|
3 | #include "helpers/str.h" |
---|
4 | |
---|
5 | gDOM *gaycko_convert_dom(htmlDocPtr doc) { |
---|
6 | |
---|
7 | gDOM *dom = malloc(sizeof(gDOM)); |
---|
8 | dom->root = NULL; |
---|
9 | htmlNodePtr root = xmlDocGetRootElement(doc); |
---|
10 | gElement **r = NULL; |
---|
11 | gElement *ret = NULL; |
---|
12 | |
---|
13 | if(root != NULL) |
---|
14 | { |
---|
15 | ret = explore(r, root, 0, NULL); |
---|
16 | } |
---|
17 | |
---|
18 | dom->root = ret; |
---|
19 | pretty_print(ret); |
---|
20 | |
---|
21 | return dom; |
---|
22 | } |
---|
23 | |
---|
24 | |
---|
25 | gElement *add_element(htmlNodePtr node) { |
---|
26 | |
---|
27 | gElement *e; |
---|
28 | e = malloc(sizeof(gElement)); |
---|
29 | e->children_count = 0; |
---|
30 | e->children = NULL; |
---|
31 | e->attribute_count = 0; |
---|
32 | e->attributes = NULL; |
---|
33 | |
---|
34 | e->name = (char*)strdup((const char*)node->name); |
---|
35 | |
---|
36 | if(!strncmp(e->name, "html", 4)) { |
---|
37 | e->type = ELEM_HTML; |
---|
38 | }else if(!strncmp(e->name, "head", 4)) { |
---|
39 | e->type = ELEM_HEAD; |
---|
40 | }else if(!strncmp(e->name, "title", 5)) { |
---|
41 | e->type = ELEM_TITLE; |
---|
42 | }else if(!strncmp(e->name, "meta", 4)) { |
---|
43 | e->type = ELEM_META; |
---|
44 | }else if(!strncmp(e->name, "body", 4)) { |
---|
45 | e->type = ELEM_BODY; |
---|
46 | }else if(!strncmp(e->name, "h1", 2)) { |
---|
47 | e->type = ELEM_H1; |
---|
48 | }else if(!strncmp(e->name, "br", 2) || !strncmp(e->name, "br ", 3)) { |
---|
49 | e->type = ELEM_BR; |
---|
50 | }else if(!strncmp(e->name, "p", 1)) { |
---|
51 | e->type = ELEM_P; |
---|
52 | }else if(!strncmp(e->name, "img", 3)) { |
---|
53 | e->type = ELEM_IMG; |
---|
54 | }else if(!strncmp(e->name, "text", 4)) { |
---|
55 | e->type = ELEM_TEXT; |
---|
56 | e->text = (char*)strdup((const char*)xmlNodeGetContent(node)); |
---|
57 | strip_eol(e->text); |
---|
58 | strip_spaces(e->text); |
---|
59 | }else { |
---|
60 | e->type = ELEM_UNKNOW; |
---|
61 | printf("Unknow tag '%s'\n", e->name); |
---|
62 | } |
---|
63 | |
---|
64 | if(node->properties) { |
---|
65 | for(xmlAttrPtr attr = node->properties; attr != NULL; attr = attr->next) { |
---|
66 | e->attributes = realloc(e->attributes, sizeof(gAttribute)); |
---|
67 | e->attributes[e->attribute_count].name = strdup(attr->name); |
---|
68 | e->attributes[e->attribute_count].value = strdup(attr->children->content); |
---|
69 | } |
---|
70 | } |
---|
71 | |
---|
72 | return e; |
---|
73 | } |
---|
74 | |
---|
75 | |
---|
76 | gElement* explore(gElement **elem, htmlNodePtr element, unsigned int level, gElement *parent) |
---|
77 | { |
---|
78 | unsigned int c = 0; |
---|
79 | for(htmlNodePtr node = element; node != NULL; node = node->next) |
---|
80 | { |
---|
81 | gElement *child = add_element(node); |
---|
82 | child->level = level; |
---|
83 | |
---|
84 | if(elem) elem = realloc(elem, sizeof(gElement*)*(c+1)); |
---|
85 | else elem = malloc(sizeof(gElement*)); |
---|
86 | |
---|
87 | |
---|
88 | elem[c] = child; |
---|
89 | |
---|
90 | c++; |
---|
91 | |
---|
92 | if(node->type == XML_ELEMENT_NODE) |
---|
93 | { |
---|
94 | if(node->children != NULL) { |
---|
95 | explore(child->children, |
---|
96 | node->children, |
---|
97 | level+1, |
---|
98 | child); |
---|
99 | } else { |
---|
100 | free(child->children); |
---|
101 | child->children = NULL; |
---|
102 | } |
---|
103 | } |
---|
104 | if(parent) { |
---|
105 | parent->children = realloc(parent->children, sizeof(gElement*)*(parent->children_count+1)); |
---|
106 | parent->children[parent->children_count] = child; |
---|
107 | parent->children_count++; |
---|
108 | } |
---|
109 | } |
---|
110 | return *elem; |
---|
111 | } |
---|
112 | |
---|
113 | |
---|
114 | #define LEVEL {unsigned int foo=0; for(foo=0; foo<elem->level; foo++) printf(" ");} |
---|
115 | |
---|
116 | void pretty_print(gElement *elem) { |
---|
117 | if(!elem) { |
---|
118 | printf("elem is %p\n", elem); |
---|
119 | return; |
---|
120 | } |
---|
121 | |
---|
122 | if(elem->type == ELEM_TEXT) { |
---|
123 | LEVEL printf("%s\n", elem->text); |
---|
124 | } else { |
---|
125 | LEVEL printf("<%s>\n", elem->name); |
---|
126 | } |
---|
127 | |
---|
128 | unsigned int i; |
---|
129 | for(i=0; i < elem->children_count; i++) { |
---|
130 | pretty_print(elem->children[i]); |
---|
131 | } |
---|
132 | |
---|
133 | if(elem->type == ELEM_TEXT) { |
---|
134 | |
---|
135 | } else { |
---|
136 | LEVEL printf("</%s>\n", elem->name); |
---|
137 | } |
---|
138 | } |
---|
139 | |
---|
140 | |
---|
/*
 * Disabled fragment, excluded from the build by `#if 0`: for a node named
 * "A" (compared case-insensitively) it prints the content of the node's first
 * child once for each attribute named "HREF". It refers to a `node` variable
 * that does not exist at file scope, so it only makes sense pasted into a
 * node-walking loop.
 */
#if 0
if (xmlStrcasecmp(node->name, (const xmlChar *)"A") == 0) {
    for (xmlAttrPtr attr = node->properties; attr != NULL; attr = attr->next) {
        if (xmlStrcasecmp(attr->name, (const xmlChar *)"HREF") == 0) {
            printf("Found link <%s>\n", node->children->content);
        }
    }
}
#endif
---|
153 | |
---|