url.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. <?php
  2. /**
  3. * base include file for SimpleTest
  4. * @package SimpleTest
  5. * @subpackage WebTester
  6. * @version $Id: url.php 2011 2011-04-29 08:22:48Z pp11 $
  7. */
  8. /**#@+
  9. * include other SimpleTest class files
  10. */
  11. require_once(dirname(__FILE__) . '/encoding.php');
  12. /**#@-*/
  13. /**
  14. * URL parser to replace parse_url() PHP function which
  15. * got broken in PHP 4.3.0. Adds some browser specific
  16. * functionality such as expandomatics.
  17. * Guesses a bit trying to separate the host from
  18. * the path and tries to keep a raw, possibly unparsable,
  19. * request string as long as possible.
  20. * @package SimpleTest
  21. * @subpackage WebTester
  22. */
  23. class SimpleUrl {
  24. private $scheme;
  25. private $username;
  26. private $password;
  27. private $host;
  28. private $port;
  29. public $path;
  30. private $request;
  31. private $fragment;
  32. private $x;
  33. private $y;
  34. private $target;
  35. private $raw = false;
  36. /**
  37. * Constructor. Parses URL into sections.
  38. * @param string $url Incoming URL.
  39. * @access public
  40. */
  41. function __construct($url = '') {
  42. list($x, $y) = $this->chompCoordinates($url);
  43. $this->setCoordinates($x, $y);
  44. $this->scheme = $this->chompScheme($url);
  45. if ($this->scheme === 'file') {
  46. // Unescaped backslashes not used in directory separator context
  47. // will get caught by this, but they should have been urlencoded
  48. // anyway so we don't care. If this ends up being a problem, the
  49. // host regexp must be modified to match for backslashes when
  50. // the scheme is file.
  51. $url = str_replace('\\', '/', $url);
  52. }
  53. list($this->username, $this->password) = $this->chompLogin($url);
  54. $this->host = $this->chompHost($url);
  55. $this->port = false;
  56. if (preg_match('/(.*?):(.*)/', $this->host, $host_parts)) {
  57. if ($this->scheme === 'file' && strlen($this->host) === 2) {
  58. // DOS drive was placed in authority; promote it to path.
  59. $url = '/' . $this->host . $url;
  60. $this->host = false;
  61. } else {
  62. $this->host = $host_parts[1];
  63. $this->port = (integer)$host_parts[2];
  64. }
  65. }
  66. $this->path = $this->chompPath($url);
  67. $this->request = $this->parseRequest($this->chompRequest($url));
  68. $this->fragment = (strncmp($url, "#", 1) == 0 ? substr($url, 1) : false);
  69. $this->target = false;
  70. }
  71. /**
  72. * Extracts the X, Y coordinate pair from an image map.
  73. * @param string $url URL so far. The coordinates will be
  74. * removed.
  75. * @return array X, Y as a pair of integers.
  76. * @access private
  77. */
  78. protected function chompCoordinates(&$url) {
  79. if (preg_match('/(.*)\?(\d+),(\d+)$/', $url, $matches)) {
  80. $url = $matches[1];
  81. return array((integer)$matches[2], (integer)$matches[3]);
  82. }
  83. return array(false, false);
  84. }
  85. /**
  86. * Extracts the scheme part of an incoming URL.
  87. * @param string $url URL so far. The scheme will be
  88. * removed.
  89. * @return string Scheme part or false.
  90. * @access private
  91. */
  92. protected function chompScheme(&$url) {
  93. if (preg_match('#^([^/:]*):(//)(.*)#', $url, $matches)) {
  94. $url = $matches[2] . $matches[3];
  95. return $matches[1];
  96. }
  97. return false;
  98. }
  99. /**
  100. * Extracts the username and password from the
  101. * incoming URL. The // prefix will be reattached
  102. * to the URL after the doublet is extracted.
  103. * @param string $url URL so far. The username and
  104. * password are removed.
  105. * @return array Two item list of username and
  106. * password. Will urldecode() them.
  107. * @access private
  108. */
  109. protected function chompLogin(&$url) {
  110. $prefix = '';
  111. if (preg_match('#^(//)(.*)#', $url, $matches)) {
  112. $prefix = $matches[1];
  113. $url = $matches[2];
  114. }
  115. if (preg_match('#^([^/]*)@(.*)#', $url, $matches)) {
  116. $url = $prefix . $matches[2];
  117. $parts = explode(":", $matches[1]);
  118. return array(
  119. urldecode($parts[0]),
  120. isset($parts[1]) ? urldecode($parts[1]) : false);
  121. }
  122. $url = $prefix . $url;
  123. return array(false, false);
  124. }
  125. /**
  126. * Extracts the host part of an incoming URL.
  127. * Includes the port number part. Will extract
  128. * the host if it starts with // or it has
  129. * a top level domain or it has at least two
  130. * dots.
  131. * @param string $url URL so far. The host will be
  132. * removed.
  133. * @return string Host part guess or false.
  134. * @access private
  135. */
  136. protected function chompHost(&$url) {
  137. if (preg_match('!^(//)(.*?)(/.*|\?.*|#.*|$)!', $url, $matches)) {
  138. $url = $matches[3];
  139. return $matches[2];
  140. }
  141. if (preg_match('!(.*?)(\.\./|\./|/|\?|#|$)(.*)!', $url, $matches)) {
  142. $tlds = SimpleUrl::getAllTopLevelDomains();
  143. if (preg_match('/[a-z0-9\-]+\.(' . $tlds . ')/i', $matches[1])) {
  144. $url = $matches[2] . $matches[3];
  145. return $matches[1];
  146. } elseif (preg_match('/[a-z0-9\-]+\.[a-z0-9\-]+\.[a-z0-9\-]+/i', $matches[1])) {
  147. $url = $matches[2] . $matches[3];
  148. return $matches[1];
  149. }
  150. }
  151. return false;
  152. }
  153. /**
  154. * Extracts the path information from the incoming
  155. * URL. Strips this path from the URL.
  156. * @param string $url URL so far. The host will be
  157. * removed.
  158. * @return string Path part or '/'.
  159. * @access private
  160. */
  161. protected function chompPath(&$url) {
  162. if (preg_match('/(.*?)(\?|#|$)(.*)/', $url, $matches)) {
  163. $url = $matches[2] . $matches[3];
  164. return ($matches[1] ? $matches[1] : '');
  165. }
  166. return '';
  167. }
  168. /**
  169. * Strips off the request data.
  170. * @param string $url URL so far. The request will be
  171. * removed.
  172. * @return string Raw request part.
  173. * @access private
  174. */
  175. protected function chompRequest(&$url) {
  176. if (preg_match('/\?(.*?)(#|$)(.*)/', $url, $matches)) {
  177. $url = $matches[2] . $matches[3];
  178. return $matches[1];
  179. }
  180. return '';
  181. }
  182. /**
  183. * Breaks the request down into an object.
  184. * @param string $raw Raw request.
  185. * @return SimpleFormEncoding Parsed data.
  186. * @access private
  187. */
  188. protected function parseRequest($raw) {
  189. $this->raw = $raw;
  190. $request = new SimpleGetEncoding();
  191. foreach (explode("&", $raw) as $pair) {
  192. if (preg_match('/(.*?)=(.*)/', $pair, $matches)) {
  193. $request->add(urldecode($matches[1]), urldecode($matches[2]));
  194. } elseif ($pair) {
  195. $request->add(urldecode($pair), '');
  196. }
  197. }
  198. return $request;
  199. }
  200. /**
  201. * Accessor for protocol part.
  202. * @param string $default Value to use if not present.
  203. * @return string Scheme name, e.g "http".
  204. * @access public
  205. */
  206. function getScheme($default = false) {
  207. return $this->scheme ? $this->scheme : $default;
  208. }
  209. /**
  210. * Accessor for user name.
  211. * @return string Username preceding host.
  212. * @access public
  213. */
  214. function getUsername() {
  215. return $this->username;
  216. }
  217. /**
  218. * Accessor for password.
  219. * @return string Password preceding host.
  220. * @access public
  221. */
  222. function getPassword() {
  223. return $this->password;
  224. }
  225. /**
  226. * Accessor for hostname and port.
  227. * @param string $default Value to use if not present.
  228. * @return string Hostname only.
  229. * @access public
  230. */
  231. function getHost($default = false) {
  232. return $this->host ? $this->host : $default;
  233. }
  234. /**
  235. * Accessor for top level domain.
  236. * @return string Last part of host.
  237. * @access public
  238. */
  239. function getTld() {
  240. $path_parts = pathinfo($this->getHost());
  241. return (isset($path_parts['extension']) ? $path_parts['extension'] : false);
  242. }
  243. /**
  244. * Accessor for port number.
  245. * @return integer TCP/IP port number.
  246. * @access public
  247. */
  248. function getPort() {
  249. return $this->port;
  250. }
  251. /**
  252. * Accessor for path.
  253. * @return string Full path including leading slash if implied.
  254. * @access public
  255. */
  256. function getPath() {
  257. if (! $this->path && $this->host) {
  258. return '/';
  259. }
  260. return $this->path;
  261. }
  262. /**
  263. * Accessor for page if any. This may be a
  264. * directory name if ambiguious.
  265. * @return Page name.
  266. * @access public
  267. */
  268. function getPage() {
  269. if (! preg_match('/([^\/]*?)$/', $this->getPath(), $matches)) {
  270. return false;
  271. }
  272. return $matches[1];
  273. }
  274. /**
  275. * Gets the path to the page.
  276. * @return string Path less the page.
  277. * @access public
  278. */
  279. function getBasePath() {
  280. if (! preg_match('/(.*\/)[^\/]*?$/', $this->getPath(), $matches)) {
  281. return false;
  282. }
  283. return $matches[1];
  284. }
  285. /**
  286. * Accessor for fragment at end of URL after the "#".
  287. * @return string Part after "#".
  288. * @access public
  289. */
  290. function getFragment() {
  291. return $this->fragment;
  292. }
  293. /**
  294. * Sets image coordinates. Set to false to clear
  295. * them.
  296. * @param integer $x Horizontal position.
  297. * @param integer $y Vertical position.
  298. * @access public
  299. */
  300. function setCoordinates($x = false, $y = false) {
  301. if (($x === false) || ($y === false)) {
  302. $this->x = $this->y = false;
  303. return;
  304. }
  305. $this->x = (integer)$x;
  306. $this->y = (integer)$y;
  307. }
  308. /**
  309. * Accessor for horizontal image coordinate.
  310. * @return integer X value.
  311. * @access public
  312. */
  313. function getX() {
  314. return $this->x;
  315. }
  316. /**
  317. * Accessor for vertical image coordinate.
  318. * @return integer Y value.
  319. * @access public
  320. */
  321. function getY() {
  322. return $this->y;
  323. }
  324. /**
  325. * Accessor for current request parameters
  326. * in URL string form. Will return teh original request
  327. * if at all possible even if it doesn't make much
  328. * sense.
  329. * @return string Form is string "?a=1&b=2", etc.
  330. * @access public
  331. */
  332. function getEncodedRequest() {
  333. if ($this->raw) {
  334. $encoded = $this->raw;
  335. } else {
  336. $encoded = $this->request->asUrlRequest();
  337. }
  338. if ($encoded) {
  339. return '?' . preg_replace('/^\?/', '', $encoded);
  340. }
  341. return '';
  342. }
  343. /**
  344. * Adds an additional parameter to the request.
  345. * @param string $key Name of parameter.
  346. * @param string $value Value as string.
  347. * @access public
  348. */
  349. function addRequestParameter($key, $value) {
  350. $this->raw = false;
  351. $this->request->add($key, $value);
  352. }
  353. /**
  354. * Adds additional parameters to the request.
  355. * @param hash/SimpleFormEncoding $parameters Additional
  356. * parameters.
  357. * @access public
  358. */
  359. function addRequestParameters($parameters) {
  360. $this->raw = false;
  361. $this->request->merge($parameters);
  362. }
  363. /**
  364. * Clears down all parameters.
  365. * @access public
  366. */
  367. function clearRequest() {
  368. $this->raw = false;
  369. $this->request = new SimpleGetEncoding();
  370. }
  371. /**
  372. * Gets the frame target if present. Although
  373. * not strictly part of the URL specification it
  374. * acts as similarily to the browser.
  375. * @return boolean/string Frame name or false if none.
  376. * @access public
  377. */
  378. function getTarget() {
  379. return $this->target;
  380. }
  381. /**
  382. * Attaches a frame target.
  383. * @param string $frame Name of frame.
  384. * @access public
  385. */
  386. function setTarget($frame) {
  387. $this->raw = false;
  388. $this->target = $frame;
  389. }
  390. /**
  391. * Renders the URL back into a string.
  392. * @return string URL in canonical form.
  393. * @access public
  394. */
  395. function asString() {
  396. $path = $this->path;
  397. $scheme = $identity = $host = $port = $encoded = $fragment = '';
  398. if ($this->username && $this->password) {
  399. $identity = $this->username . ':' . $this->password . '@';
  400. }
  401. if ($this->getHost()) {
  402. $scheme = $this->getScheme() ? $this->getScheme() : 'http';
  403. $scheme .= '://';
  404. $host = $this->getHost();
  405. } elseif ($this->getScheme() === 'file') {
  406. // Safest way; otherwise, file URLs on Windows have an extra
  407. // leading slash. It might be possible to convert file://
  408. // URIs to local file paths, but that requires more research.
  409. $scheme = 'file://';
  410. }
  411. if ($this->getPort() && $this->getPort() != 80 ) {
  412. $port = ':'.$this->getPort();
  413. }
  414. if (substr($this->path, 0, 1) == '/') {
  415. $path = $this->normalisePath($this->path);
  416. }
  417. $encoded = $this->getEncodedRequest();
  418. $fragment = $this->getFragment() ? '#'. $this->getFragment() : '';
  419. $coords = $this->getX() === false ? '' : '?' . $this->getX() . ',' . $this->getY();
  420. return "$scheme$identity$host$port$path$encoded$fragment$coords";
  421. }
  422. /**
  423. * Replaces unknown sections to turn a relative
  424. * URL into an absolute one. The base URL can
  425. * be either a string or a SimpleUrl object.
  426. * @param string/SimpleUrl $base Base URL.
  427. * @access public
  428. */
  429. function makeAbsolute($base) {
  430. if (! is_object($base)) {
  431. $base = new SimpleUrl($base);
  432. }
  433. if ($this->getHost()) {
  434. $scheme = $this->getScheme();
  435. $host = $this->getHost();
  436. $port = $this->getPort() ? ':' . $this->getPort() : '';
  437. $identity = $this->getIdentity() ? $this->getIdentity() . '@' : '';
  438. if (! $identity) {
  439. $identity = $base->getIdentity() ? $base->getIdentity() . '@' : '';
  440. }
  441. } else {
  442. $scheme = $base->getScheme();
  443. $host = $base->getHost();
  444. $port = $base->getPort() ? ':' . $base->getPort() : '';
  445. $identity = $base->getIdentity() ? $base->getIdentity() . '@' : '';
  446. }
  447. $path = $this->normalisePath($this->extractAbsolutePath($base));
  448. $encoded = $this->getEncodedRequest();
  449. $fragment = $this->getFragment() ? '#'. $this->getFragment() : '';
  450. $coords = $this->getX() === false ? '' : '?' . $this->getX() . ',' . $this->getY();
  451. return new SimpleUrl("$scheme://$identity$host$port$path$encoded$fragment$coords");
  452. }
  453. /**
  454. * Replaces unknown sections of the path with base parts
  455. * to return a complete absolute one.
  456. * @param string/SimpleUrl $base Base URL.
  457. * @param string Absolute path.
  458. * @access private
  459. */
  460. protected function extractAbsolutePath($base) {
  461. if ($this->getHost()) {
  462. return $this->path;
  463. }
  464. if (! $this->isRelativePath($this->path)) {
  465. return $this->path;
  466. }
  467. if ($this->path) {
  468. return $base->getBasePath() . $this->path;
  469. }
  470. return $base->getPath();
  471. }
  472. /**
  473. * Simple test to see if a path part is relative.
  474. * @param string $path Path to test.
  475. * @return boolean True if starts with a "/".
  476. * @access private
  477. */
  478. protected function isRelativePath($path) {
  479. return (substr($path, 0, 1) != '/');
  480. }
  481. /**
  482. * Extracts the username and password for use in rendering
  483. * a URL.
  484. * @return string/boolean Form of username:password or false.
  485. * @access public
  486. */
  487. function getIdentity() {
  488. if ($this->username && $this->password) {
  489. return $this->username . ':' . $this->password;
  490. }
  491. return false;
  492. }
  493. /**
  494. * Replaces . and .. sections of the path.
  495. * @param string $path Unoptimised path.
  496. * @return string Path with dots removed if possible.
  497. * @access public
  498. */
  499. function normalisePath($path) {
  500. $path = preg_replace('|/\./|', '/', $path);
  501. return preg_replace('|/[^/]+/\.\./|', '/', $path);
  502. }
  503. /**
  504. * A pipe seperated list of all TLDs that result in two part
  505. * domain names.
  506. * @return string Pipe separated list.
  507. * @access public
  508. */
  509. static function getAllTopLevelDomains() {
  510. return 'com|edu|net|org|gov|mil|int|biz|info|name|pro|aero|coop|museum';
  511. }
  512. }
  513. ?>