tidy_parser.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. <?php
  /**
   * Base include file for SimpleTest.
   * @package SimpleTest
   * @subpackage WebTester
   * @version $Id: php_parser.php 1911 2009-07-29 16:38:04Z lastcraft $
   */
  /**
   * Builds the page object by running the fetched markup through HTML Tidy
   * and walking the resulting node tree to collect links, the title, the
   * base URL, frames, forms (with their widgets) and labels.
   *
   * Tidy discards some content SimpleTest needs (empty <option>/<textarea>
   * tags, whitespace inside <textarea>), so temporary "guard" markers are
   * inserted before tidying and removed again when node text is read back.
   *
   * @package SimpleTest
   * @subpackage WebTester
   */
  class SimpleTidyPageBuilder {
      /** @var SimplePage|null Page being populated; only set while parsing. */
      private $page;
      /** @var array SimpleForm objects found so far. */
      private $forms = array();
      /** @var array Label tags found so far. */
      private $labels = array();
      /** @var array Map of id attribute to the list of widgets carrying it. */
      private $widgets_by_id = array();

      /**
       * Releases all held references when the builder goes away.
       */
      public function __destruct() {
          $this->free();
      }

      /**
       * Frees up any references so as to allow the PHP garbage
       * collection to work, and resets all per-parse state so the
       * builder can be reused without leaking data between pages.
       */
      private function free() {
          $this->page = null;
          $this->forms = array();
          $this->labels = array();
          // The id index must be cleared too, otherwise widgets from an
          // earlier parse() could receive labels from a later page.
          $this->widgets_by_id = array();
      }

      /**
       * This builder is only available if the 'tidy' extension is loaded.
       * @return boolean       True if available.
       */
      public function can() {
          return extension_loaded('tidy');
      }

      /**
       * Reads the raw content of the page using HTML Tidy.
       * @param $response SimpleHttpResponse  Fetched response.
       * @return SimplePage                   Newly parsed page.
       */
      public function parse($response) {
          // Start from a clean slate in case a previous run was interrupted.
          $this->free();
          $this->page = new SimplePage($response);
          $tidied = tidy_parse_string(
                  $this->insertGuards($response->getContent()),
                  array('output-xml' => false, 'wrap' => '0', 'indent' => 'no'),
                  'latin1');
          // tidy_parse_string() may fail and html() may find no <html>
          // node; in both cases there is simply nothing to walk.
          $root = $tidied ? $tidied->html() : null;
          if ($root) {
              $this->walkTree($root);
          }
          $this->attachLabels($this->widgets_by_id, $this->labels);
          $this->page->setForms($this->forms);
          $page = $this->page;
          $this->free();
          return $page;
      }

      /**
       * Stops HTMLTidy stripping content that we wish to preserve.
       * Textarea whitespace is guarded first, then empty tags.
       * @param string      The raw html.
       * @return string     The html with guard tags inserted.
       */
      private function insertGuards($html) {
          return $this->insertEmptyTagGuards($this->insertTextareaSimpleWhitespaceGuards($html));
      }

      /**
       * Removes the extra content added during the parse stage
       * in order to preserve content we don't want stripped
       * out by HTMLTidy.
       * @param string      The raw html.
       * @return string     The html with guard tags removed.
       */
      private function stripGuards($html) {
          return $this->stripTextareaWhitespaceGuards($this->stripEmptyTagGuards($html));
      }

      /**
       * HTML tidy strips out empty tags such as <option> which we
       * need to preserve. This method inserts an additional marker
       * into option/textarea tags that hold nothing but whitespace.
       * @param string      The raw html.
       * @return string     The html with guards inserted.
       */
      private function insertEmptyTagGuards($html) {
          return preg_replace('#<(option|textarea)([^>]*)>(\s*)</(option|textarea)>#is',
                              '<\1\2>___EMPTY___\3</\4>',
                              $html);
      }

      /**
       * Strips the ___EMPTY___ markers that insertEmptyTagGuards()
       * added to keep otherwise empty tags alive through tidy.
       * A marker is only removed when it is the sole non-whitespace
       * content between a tag end (or string start) and a closing
       * tag (or string end).
       * @param string      The raw html.
       * @return string     The html with guards removed.
       */
      private function stripEmptyTagGuards($html) {
          // Groups 1 and 4 are the surrounding '>' and '</' delimiters;
          // they are part of the match and must be written back.
          return preg_replace('#(^|>)(\s*)___EMPTY___(\s*)(</|$)#i', '\1\2\3\4', $html);
      }

      /**
       * By parsing the XML output of tidy, we lose some whitespace
       * information in textarea tags. We temporarily recode this
       * data ourselves so as not to lose it.
       * @param string      The raw html.
       * @return string     The html with guards inserted.
       */
      private function insertTextareaSimpleWhitespaceGuards($html) {
          return preg_replace_callback('#<textarea([^>]*)>(.*?)</textarea>#is',
                                       array($this, 'insertWhitespaceGuards'),
                                       $html);
      }

      /**
       * Callback for insertTextareaSimpleWhitespaceGuards().
       * @param array $matches       Result of preg_replace_callback():
       *                             [1] is the attribute text, [2] the
       *                             textarea content.
       * @return string              Guard tags now replace whitespace.
       */
      private function insertWhitespaceGuards($matches) {
          $guarded = str_replace(
                  array("\n", "\r", "\t", ' '),
                  array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
                  $matches[2]);
          return '<textarea' . $matches[1] . '>' . $guarded . '</textarea>';
      }

      /**
       * Removes the whitespace preserving guards we added
       * before parsing.
       * @param string      The raw html.
       * @return string     The html with guards removed.
       */
      private function stripTextareaWhitespaceGuards($html) {
          return str_replace(
                  array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
                  array("\n", "\r", "\t", ' '),
                  $html);
      }

      /**
       * Builds a tag for the node from Tidy's own attribute list and
       * fills it with the node's (guard-stripped) inner markup.
       * @param object $node      Tidy XML node.
       * @return SimpleTag        Tag as produced by the tag builder.
       */
      private function createContentTag($node) {
          return $this->tags()
                  ->createTag($node->name, (array)$node->attribute)
                  ->addContent($this->innerHtml($node));
      }

      /**
       * Visits the given node and all children. Recognised elements
       * are recorded and not descended into (forms are handed over to
       * walkForm()); anything else is walked recursively.
       * @param object $node      Tidy XML node.
       */
      private function walkTree($node) {
          $name = $node->name;
          if ($name === 'a') {
              $this->page->addLink($this->createContentTag($node));
          } elseif ($name === 'base' and isset($node->attribute['href'])) {
              $this->page->setBase($node->attribute['href']);
          } elseif ($name === 'title') {
              $this->page->setTitle($this->createContentTag($node));
          } elseif ($name === 'frameset') {
              $this->page->setFrames($this->collectFrames($node));
          } elseif ($name === 'form') {
              $this->forms[] = $this->walkForm($node, $this->createEmptyForm($node));
          } elseif ($name === 'label') {
              $this->labels[] = $this->createContentTag($node);
          } else {
              $this->walkChildren($node);
          }
      }

      /**
       * Helper method for traversing the XML tree.
       * @param object $node     Tidy XML node.
       */
      private function walkChildren($node) {
          if (! $node->hasChildren()) {
              return;
          }
          foreach ($node->child as $child) {
              $this->walkTree($child);
          }
      }

      /**
       * Facade for forms containing preparsed widgets.
       * @param object $node     Tidy XML node.
       * @return SimpleForm      Facade for SimpleBrowser.
       */
      private function createEmptyForm($node) {
          return new SimpleForm($this->tags()->createTag($node->name, (array)$node->attribute), $this->page);
      }

      /**
       * Visits the given node and all children inside a form,
       * registering links, labels and widgets as they are found.
       * @param object $node               Tidy XML node.
       * @param SimpleForm $form           Form collecting the widgets.
       * @param string $enclosing_label    Text of the label tag we are
       *                                   inside of, if any.
       * @return SimpleForm                The form that was passed in.
       */
      private function walkForm($node, $form, $enclosing_label = '') {
          $name = $node->name;
          if ($name === 'a') {
              $this->page->addLink($this->createContentTag($node));
              return $form;
          }
          if (in_array($name, array('input', 'button', 'textarea', 'select'), true)) {
              $this->addWidgetToForm($node, $form, $enclosing_label);
              return $form;
          }
          if (! $node->hasChildren()) {
              if ($name === 'label') {
                  $this->labels[] = $this->createContentTag($node);
              }
              return $form;
          }
          if ($name === 'label') {
              $this->labels[] = $this->createContentTag($node);
              // Computed once: the label text is the same for every child.
              $enclosing_label = SimplePage::normalise($this->innerHtml($node));
          }
          // The label text is handed down at every level so that a widget
          // wrapped in extra markup inside a <label> still receives it.
          foreach ($node->child as $child) {
              $this->walkForm($child, $form, $enclosing_label);
          }
          return $form;
      }

      /**
       * Tests a node for a non-empty "for" attribute. Used for
       * attaching labels.
       * @param object $node     Tidy XML node.
       * @return boolean         True if the "for" attribute exists
       *                         and has a non-empty value.
       */
      private function hasFor($node) {
          // empty() copes with a missing attribute list or missing key
          // without raising a notice.
          return ! empty($node->attribute['for']);
      }

      /**
       * Adds the widget into the form container. Nodes for which the
       * tag builder yields no tag are silently skipped.
       * @param object $node               Tidy XML node of widget.
       * @param SimpleForm $form           Form to add it to.
       * @param string $enclosing_label    The label of any label
       *                                   tag we might be in.
       */
      private function addWidgetToForm($node, $form, $enclosing_label) {
          $widget = $this->tags()->createTag($node->name, $this->attributes($node));
          if (! $widget) {
              return;
          }
          $widget->setLabel($enclosing_label)
                 ->addContent($this->innerHtml($node));
          if ($node->name === 'select') {
              $widget->addTags($this->collectSelectOptions($node));
          }
          $form->addWidget($widget);
          $this->indexWidgetById($widget);
      }

      /**
       * Fills the widget cache to speed up searching. Widgets without
       * an id are not indexed; several widgets may share one id.
       * @param SimpleTag $widget    Parsed widget to cache.
       */
      private function indexWidgetById($widget) {
          $id = $widget->getAttribute('id');
          if (! $id) {
              return;
          }
          if (! isset($this->widgets_by_id[$id])) {
              $this->widgets_by_id[$id] = array();
          }
          $this->widgets_by_id[$id][] = $widget;
      }

      /**
       * Parses the options from inside an XML select node, at any
       * nesting depth and in document order.
       * @param object $node      Tidy XML node.
       * @return array            List of SimpleTag options.
       */
      private function collectSelectOptions($node) {
          $options = array();
          if ($node->name === 'option') {
              $options[] = $this->tags()->createTag($node->name, $this->attributes($node))
                                        ->addContent($this->innerHtml($node));
          }
          if ($node->hasChildren()) {
              foreach ($node->child as $child) {
                  $options = array_merge($options, $this->collectSelectOptions($child));
              }
          }
          return $options;
      }

      /**
       * Convenience method for collecting all the attributes
       * of a tag by parsing the opening tag in the node's
       * serialised markup.
       * @param object $node      Tidy XML node.
       * @return array            Hash of attribute strings.
       */
      private function attributes($node) {
          // Capture everything between the tag name and the closing '>'.
          if (! preg_match('|<[^ ]+\s(.*?)/?>|s', $node->value, $first_tag_contents)) {
              return array();
          }
          $attributes = array();
          // Alternatives: single quoted, double quoted, unquoted, bare name.
          preg_match_all('/\S+\s*=\s*\'[^\']*\'|(\S+\s*=\s*"[^"]*")|([^ =]+\s*=\s*[^ "\']+?)|[^ "\']+/', $first_tag_contents[1], $matches);
          foreach ($matches[0] as $unparsed) {
              $attributes = $this->mergeAttribute($attributes, $unparsed);
          }
          return $attributes;
      }

      /**
       * Overlay an attribute into the attributes hash. A bare
       * attribute (no '=') takes its own name as its value.
       * @param array $attributes     Current attribute list.
       * @param string $raw           Raw attribute string with
       *                              both key and value.
       * @return array                New attribute hash.
       */
      private function mergeAttribute($attributes, $raw) {
          // Split on the first '=' only: values such as query strings
          // legitimately contain further '=' characters.
          $parts = explode('=', $raw, 2);
          $name = $parts[0];
          $value = count($parts) == 1 ? $parts[0] : $parts[1];
          $attributes[trim($name)] = html_entity_decode($this->dequote(trim($value)), ENT_QUOTES);
          return $attributes;
      }

      /**
       * Remove start and end quotes.
       * @param string $quoted    A quoted string.
       * @return string           Quotes are gone; the input is
       *                          returned untouched if not quoted.
       */
      private function dequote($quoted) {
          if (preg_match('/^(\'([^\']*)\'|"([^"]*)")$/', $quoted, $matches)) {
              // Group 3 only exists when the double quoted branch matched.
              return isset($matches[3]) ? $matches[3] : $matches[2];
          }
          return $quoted;
      }

      /**
       * Collects frame information inside a frameset tag,
       * flattening any nesting.
       * @param object $node     Tidy XML node.
       * @return array           List of SimpleTag frame descriptions.
       */
      private function collectFrames($node) {
          if ($node->name === 'frame') {
              return array($this->tags()->createTag($node->name, (array)$node->attribute));
          }
          $frames = array();
          if ($node->hasChildren()) {
              foreach ($node->child as $child) {
                  $frames = array_merge($frames, $this->collectFrames($child));
              }
          }
          return $frames;
      }

      /**
       * Extracts the markup inside a node by joining the serialised
       * value of each child, with guard markers removed.
       * @param object $node     Tidy XML node.
       * @return string          The inner markup.
       */
      private function innerHtml($node) {
          $raw = '';
          if ($node->hasChildren()) {
              foreach ($node->child as $child) {
                  $raw .= $child->value;
              }
          }
          return $this->stripGuards($raw);
      }

      /**
       * Factory for parsed content holders.
       * @return SimpleTagBuilder    Factory.
       */
      private function tags() {
          return new SimpleTagBuilder();
      }

      /**
       * Called at the end of a parse run. Attaches any
       * non-wrapping labels to their form elements.
       * @param array $widgets_by_id    Cached SimpleTag hash.
       * @param array $labels           SimpleTag label elements.
       */
      private function attachLabels($widgets_by_id, $labels) {
          foreach ($labels as $label) {
              $for = $label->getFor();
              if (! $for or ! isset($widgets_by_id[$for])) {
                  continue;
              }
              $text = $label->getText();
              foreach ($widgets_by_id[$for] as $widget) {
                  $widget->setLabel($text);
              }
          }
      }
  }
  355. ?>