<?php
/**
 * Base include file for SimpleTest.
 * @package SimpleTest
 * @subpackage WebTester
 * @version $Id: php_parser.php 1911 2009-07-29 16:38:04Z lastcraft $
 */
/**
 * Builds the page object by parsing the response content with the
 * HTML Tidy extension and walking the resulting node tree.
 *
 * Links, the base URL, the title, frames, forms (with their widgets)
 * and labels are collected into a SimplePage. Guard markers are
 * inserted into the raw HTML before Tidy runs, and removed again when
 * node content is read back, so that empty option/textarea tags and
 * textarea whitespace survive the Tidy pass.
 *
 * @package SimpleTest
 * @subpackage WebTester
 */
class SimpleTidyPageBuilder {
    private $page;
    private $forms = array();
    private $labels = array();
    private $widgets_by_id = array();

    /**
     * Releases all per-parse state when the builder is destroyed.
     */
    public function __destruct() {
        $this->free();
    }

    /**
     * Frees up any references so as to allow the PHP garbage
     * collection to work, and returns the builder to a clean state.
     * The widget index is cleared as well, otherwise widgets from one
     * parse() run would still be indexed during the next one.
     */
    private function free() {
        $this->page = null;
        $this->forms = array();
        $this->labels = array();
        $this->widgets_by_id = array();
    }

    /**
     * This builder is only available if the 'tidy' extension is loaded.
     * @return boolean       True if available.
     */
    public function can() {
        return extension_loaded('tidy');
    }

    /**
     * Reads the raw content of the response and parses it into a page
     * using HTML Tidy.
     * @param SimpleHttpResponse $response   Fetched response.
     * @return SimplePage                    Newly parsed page.
     */
    public function parse($response) {
        // Start from a clean slate in case an earlier run was interrupted.
        $this->free();
        $this->page = new SimplePage($response);
        $tidied = tidy_parse_string(
                $this->insertGuards((string) $response->getContent()),
                array('output-xml' => false, 'wrap' => '0', 'indent' => 'no'),
                'latin1');
        // tidy_parse_string() reports failure with a non-object value;
        // in that case there is simply no tree to walk.
        $root = is_object($tidied) ? $tidied->html() : null;
        if ($root) {
            $this->walkTree($root);
        }
        $this->attachLabels($this->widgets_by_id, $this->labels);
        $this->page->setForms($this->forms);
        $page = $this->page;
        $this->free();
        return $page;
    }

    /**
     * Stops HTMLTidy stripping content that we wish to preserve.
     * Textarea whitespace is guarded first, then empty tags.
     * @param string $html   The raw html.
     * @return string        The html with guard tags inserted.
     */
    private function insertGuards($html) {
        return $this->insertEmptyTagGuards($this->insertTextareaSimpleWhitespaceGuards($html));
    }

    /**
     * Removes the extra content added during the parse stage
     * in order to preserve content we don't want stripped
     * out by HTMLTidy.
     * @param string $html   The html containing guards.
     * @return string        The html with guard tags removed.
     */
    private function stripGuards($html) {
        return $this->stripTextareaWhitespaceGuards($this->stripEmptyTagGuards($html));
    }

    /**
     * HTML tidy strips out empty tags such as <option> which we
     * need to preserve. This method inserts an additional marker
     * into empty (or whitespace only) option and textarea tags.
     * @param string $html   The raw html.
     * @return string        The html with guards inserted.
     */
    private function insertEmptyTagGuards($html) {
        return preg_replace('#<(option|textarea)([^>]*)>(\s*)</(option|textarea)>#is',
                            '<\1\2>___EMPTY___\3</\4>',
                            $html);
    }

    /**
     * Strips the markers inserted by insertEmptyTagGuards(). Only the
     * marker itself is removed: surrounding whitespace and the
     * delimiters of any enclosing tags ('>' before, '</' after) are
     * kept, so nested markup is not corrupted.
     * @param string $html   The html containing guards.
     * @return string        The html with guards removed.
     */
    private function stripEmptyTagGuards($html) {
        return preg_replace('#(^|>)(\s*)___EMPTY___(\s*)(</|$)#i', '\1\2\3\4', $html);
    }

    /**
     * Whitespace inside textarea tags would otherwise be lost during
     * the Tidy pass. We temporarily recode it ourselves so as not to
     * lose it.
     * @param string $html   The raw html.
     * @return string        The html with guards inserted.
     */
    private function insertTextareaSimpleWhitespaceGuards($html) {
        return preg_replace_callback('#<textarea([^>]*)>(.*?)</textarea>#is',
                                     array($this, 'insertWhitespaceGuards'),
                                     $html);
    }

    /**
     * Callback for insertTextareaSimpleWhitespaceGuards().
     * @param array $matches     Match groups: [1] is the attribute
     *                           text, [2] the textarea content.
     * @return string            Guard tags now replace whitespace.
     */
    private function insertWhitespaceGuards($matches) {
        $guarded = str_replace(
                array("\n", "\r", "\t", ' '),
                array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
                $matches[2]);
        return '<textarea' . $matches[1] . '>' . $guarded . '</textarea>';
    }

    /**
     * Removes the whitespace preserving guards we added
     * before parsing.
     * @param string $html   The html containing guards.
     * @return string        The html with guards removed.
     */
    private function stripTextareaWhitespaceGuards($html) {
        return str_replace(array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
                           array("\n", "\r", "\t", ' '),
                           $html);
    }

    /**
     * Builds a tag from the node's name and Tidy attributes and fills
     * it with the node's inner html (guards removed).
     * @param object $node   Tidy XML node.
     * @return SimpleTag     Tag holding the node content.
     */
    private function createContentTag($node) {
        return $this->tags()->createTag($node->name, (array)$node->attribute)
                ->addContent($this->innerHtml($node));
    }

    /**
     * Visits the given node and all children. Nodes of interest
     * (a, base, title, frameset, form, label) are recorded and not
     * descended into any further by this method; all other nodes
     * just have their children visited.
     * @param object $node   Tidy XML node.
     */
    private function walkTree($node) {
        if ($node->name == 'a') {
            $this->page->addLink($this->createContentTag($node));
        } elseif ($node->name == 'base' && isset($node->attribute['href'])) {
            $this->page->setBase($node->attribute['href']);
        } elseif ($node->name == 'title') {
            $this->page->setTitle($this->createContentTag($node));
        } elseif ($node->name == 'frameset') {
            $this->page->setFrames($this->collectFrames($node));
        } elseif ($node->name == 'form') {
            $this->forms[] = $this->walkForm($node, $this->createEmptyForm($node));
        } elseif ($node->name == 'label') {
            $this->labels[] = $this->createContentTag($node);
        } else {
            $this->walkChildren($node);
        }
    }

    /**
     * Helper method for traversing the XML tree.
     * @param object $node   Tidy XML node.
     */
    private function walkChildren($node) {
        if (! $node->hasChildren()) {
            return;
        }
        foreach ($node->child as $child) {
            $this->walkTree($child);
        }
    }

    /**
     * Facade for forms containing preparsed widgets.
     * @param object $node   Tidy XML node.
     * @return SimpleForm    Facade for SimpleBrowser.
     */
    private function createEmptyForm($node) {
        return new SimpleForm($this->tags()->createTag($node->name, (array)$node->attribute), $this->page);
    }

    /**
     * Visits the given form node and all of its children, adding
     * links to the page, widgets to the form and labels to the
     * pending label list.
     * @param object $node               Tidy XML node.
     * @param SimpleForm $form           Form being populated.
     * @param string $enclosing_label    Text of the label tag we are
     *                                   inside of, if any.
     * @return SimpleForm                The same form, for chaining.
     */
    private function walkForm($node, $form, $enclosing_label = '') {
        if ($node->name == 'a') {
            $this->page->addLink($this->createContentTag($node));
        } elseif (in_array($node->name, array('input', 'button', 'textarea', 'select'), true)) {
            $this->addWidgetToForm($node, $form, $enclosing_label);
        } elseif ($node->name == 'label') {
            $this->labels[] = $this->createContentTag($node);
            if ($node->hasChildren()) {
                // The label text is the same for every child, so work it out once.
                $label_text = SimplePage::normalise($this->innerHtml($node));
                foreach ($node->child as $child) {
                    $this->walkForm($child, $form, $label_text);
                }
            }
        } elseif ($node->hasChildren()) {
            // Keep passing the enclosing label down so that a widget wrapped
            // in another element inside a <label> still receives it.
            foreach ($node->child as $child) {
                $this->walkForm($child, $form, $enclosing_label);
            }
        }
        return $form;
    }

    /**
     * Tests a node for a "for" attribute. Used for
     * attaching labels.
     * @param object $node   Tidy XML node.
     * @return boolean       True if a non-empty "for" attribute exists.
     */
    private function hasFor($node) {
        // empty() copes with both a missing attribute list and a missing key.
        return ! empty($node->attribute['for']);
    }

    /**
     * Adds the widget into the form container and indexes it by id.
     * Nothing is added when the tag factory does not produce a widget
     * for the node.
     * @param object $node               Tidy XML node of widget.
     * @param SimpleForm $form           Form to add it to.
     * @param string $enclosing_label    The label of any label
     *                                   tag we might be in.
     */
    private function addWidgetToForm($node, $form, $enclosing_label) {
        $widget = $this->tags()->createTag($node->name, $this->attributes($node));
        if (! $widget) {
            return;
        }
        $widget->setLabel($enclosing_label)
                ->addContent($this->innerHtml($node));
        if ($node->name == 'select') {
            $widget->addTags($this->collectSelectOptions($node));
        }
        $form->addWidget($widget);
        $this->indexWidgetById($widget);
    }

    /**
     * Fills the widget cache to speed up searching. Widgets without
     * an id are not indexed; several widgets may share one id.
     * @param SimpleTag $widget    Parsed widget to cache.
     */
    private function indexWidgetById($widget) {
        $id = $widget->getAttribute('id');
        if (! $id) {
            return;
        }
        if (! isset($this->widgets_by_id[$id])) {
            $this->widgets_by_id[$id] = array();
        }
        $this->widgets_by_id[$id][] = $widget;
    }

    /**
     * Parses the options from inside an XML select node. The node
     * itself is included if it is an option, followed by all options
     * found below it in document order.
     * @param object $node   Tidy XML node.
     * @return array         List of SimpleTag options.
     */
    private function collectSelectOptions($node) {
        $options = array();
        if ($node->name == 'option') {
            $options[] = $this->tags()->createTag($node->name, $this->attributes($node))
                    ->addContent($this->innerHtml($node));
        }
        if ($node->hasChildren()) {
            foreach ($node->child as $child) {
                foreach ($this->collectSelectOptions($child) as $option) {
                    $options[] = $option;
                }
            }
        }
        return $options;
    }

    /**
     * Convenience method for collecting all the attributes
     * of a tag by parsing the opening tag of the node's text.
     * Handles single quoted, double quoted, unquoted and
     * valueless (e.g. "checked") attributes.
     * @param object $node   Tidy XML node.
     * @return array         Hash of attribute strings.
     */
    private function attributes($node) {
        if (! preg_match('|<[^ ]+\s(.*?)/?>|s', (string) $node->value, $first_tag_contents)) {
            return array();
        }
        $attributes = array();
        // The unquoted alternative is greedy: a lazy match there would stop
        // after one character and split 'size=10' into 'size=1' and '0'.
        preg_match_all('/\S+\s*=\s*\'[^\']*\'|(\S+\s*=\s*"[^"]*")|([^ =]+\s*=\s*[^ "\']+)|[^ "\']+/', $first_tag_contents[1], $matches);
        foreach ($matches[0] as $unparsed) {
            $attributes = $this->mergeAttribute($attributes, $unparsed);
        }
        return $attributes;
    }

    /**
     * Overlay an attribute into the attributes hash. A valueless
     * attribute uses its own name as the value.
     * @param array $attributes  Current attribute list.
     * @param string $raw        Raw attribute string with
     *                           both key and value.
     * @return array             New attribute hash.
     */
    private function mergeAttribute($attributes, $raw) {
        // Split on the first '=' only, as the value may itself contain
        // '=' characters (query strings, for example).
        $parts = explode('=', $raw, 2);
        list($name, $value) = count($parts) == 1 ? array($parts[0], $parts[0]) : $parts;
        $attributes[trim($name)] = html_entity_decode($this->dequote(trim($value)), ENT_QUOTES);
        return $attributes;
    }

    /**
     * Remove start and end quotes.
     * @param string $quoted    A quoted string.
     * @return string           Quotes are gone. Unquoted input is
     *                          returned unchanged.
     */
    private function dequote($quoted) {
        if (preg_match('/^(\'([^\']*)\'|"([^"]*)")$/', $quoted, $matches)) {
            // Group 3 only exists for a double quoted match.
            return isset($matches[3]) ? $matches[3] : $matches[2];
        }
        return $quoted;
    }

    /**
     * Collects frame information inside a frameset tag.
     * @param object $node   Tidy XML node.
     * @return array         List of SimpleTag frame descriptions.
     */
    private function collectFrames($node) {
        if ($node->name == 'frame') {
            return array($this->tags()->createTag($node->name, (array)$node->attribute));
        }
        $frames = array();
        if ($node->hasChildren()) {
            foreach ($node->child as $child) {
                foreach ($this->collectFrames($child) as $frame) {
                    $frames[] = $frame;
                }
            }
        }
        return $frames;
    }

    /**
     * Extracts the content of a node by joining the text of its
     * direct children and removing our guard markers.
     * @param object $node   Tidy XML node.
     * @return string        The inner content, guards removed.
     */
    private function innerHtml($node) {
        $raw = '';
        if ($node->hasChildren()) {
            foreach ($node->child as $child) {
                $raw .= $child->value;
            }
        }
        return $this->stripGuards($raw);
    }

    /**
     * Factory for parsed content holders.
     * @return SimpleTagBuilder    Factory.
     */
    private function tags() {
        return new SimpleTagBuilder();
    }

    /**
     * Called at the end of a parse run. Attaches any
     * non-wrapping labels to their form elements, matching the
     * label's "for" value against the widget ids.
     * @param array $widgets_by_id   Cached SimpleTag hash.
     * @param array $labels          SimpleTag label elements.
     */
    private function attachLabels($widgets_by_id, $labels) {
        foreach ($labels as $label) {
            $for = $label->getFor();
            if (! $for || ! isset($widgets_by_id[$for])) {
                continue;
            }
            $text = $label->getText();
            foreach ($widgets_by_id[$for] as $widget) {
                $widget->setLabel($text);
            }
        }
    }
}
?>