/**
 * MetaParser
 *
 * Regex-based extractor for the metadata of an HTML document: base url,
 * charset, favicon, images, meta description/keywords, Open Graph
 * properties, social-network handles and title. Results of the scalar
 * getters are cached per instance (including "not found").
 *
 * NOTE(review): the tag-matching patterns in _parseBase, _parseCharset,
 * _parseFavicon, _parseImages, _parseMetaTag and _parseTitle were rewritten
 * during review because the reviewed copy of this file had lost its markup
 * (everything between "<" and ">" was stripped). They honour the capture
 * semantics the surrounding code relies on, but should be compared against
 * the upstream PHP-MetaParser source before release.
 *
 * @author  Oliver Nassar
 * @notes   The following urls provide good examples of the parsing engine:
 *          http://www.bbc.com/
 *          http://www.nytimes.com/
 *          http://techcrunch.com/
 *          http://metallo.scripps.edu/
 *          http://jobs.businessinsider.com/job/3b9d1c8e1e5e31d4e4fefb4a551b8b90/?d=1&source=site_home
 *          http://www.wikipedia.org
 *          http://yahoo.com
 *          http://twitter.com/wikileaks/status/8920530488926208#s
 *          http://veryawesomeworld.com/awesomebook/inside.html
 * @example
 *
 *     // booting
 *     require_once APP . '/vendors/PHP-Curler/Curler.class.php';
 *     require_once APP . '/vendors/PHP-MetaParser/MetaParser.class.php';
 *
 *     // curling
 *     $curler = new Curler();
 *     $url = 'http://www.bbc.com/';
 *     $content = $curler->get($url);
 *     $parser = new MetaParser($content, $url);
 *     print_r($parser->getDetails());
 */
class MetaParser
{
    /**
     * Cache of already-parsed values, keyed by detail name ('base',
     * 'charset', ...). A key that is present with value false means
     * "parsed, nothing found".
     *
     * @var    array
     * @access protected
     */
    protected $_parsed = array();

    /**
     * Raw document markup being inspected.
     *
     * @var    string
     * @access protected
     */
    protected $_body;

    /**
     * Url the document was retrieved from.
     *
     * @var    string
     * @access protected
     */
    protected $_url;

    /**
     * __construct
     *
     * Requires the content body and url to be provided. The url is used to
     * turn relative paths (images, favicon, ...) into absolute ones.
     *
     * @access public
     * @param  string $body
     * @param  string $url
     * @return void
     */
    public function __construct($body, $url)
    {
        $this->_body = $body;
        $this->_url = $url;
    }

    /**
     * _buildOrigin
     *
     * Builds "scheme://host[:port]" from the result of parse_url(). Missing
     * components are rendered as empty strings rather than raising notices.
     *
     * @access private
     * @param  array|false $components result of parse_url()
     * @return string
     */
    private function _buildOrigin($components)
    {
        if (is_array($components) === false) {
            $components = array();
        }
        $scheme = isset($components['scheme']) ? $components['scheme'] : '';
        $host = isset($components['host']) ? $components['host'] : '';
        $origin = $scheme . '://' . $host;

        // keep a non-default port, otherwise resolved links point at the
        // wrong server
        if (isset($components['port'])) {
            $origin .= ':' . $components['port'];
        }
        return $origin;
    }

    /**
     * _memoize
     *
     * Returns the cached value for $key, computing it through the named
     * parser method on first use. Unlike an isset() check this also caches
     * false, so a missing tag is only searched for once.
     *
     * @access private
     * @param  string $key    cache key in $_parsed
     * @param  string $method name of the parser method on this class
     * @return mixed
     */
    private function _memoize($key, $method)
    {
        if (is_array($this->_parsed) === false) {
            $this->_parsed = array();
        }
        if (array_key_exists($key, $this->_parsed) === false) {
            $this->_parsed[$key] = $this->$method();
        }
        return $this->_parsed[$key];
    }

    /**
     * _resolveFullPath
     *
     * Resolves an address found in the document against a base url.
     * Handles, in order: empty address (base is returned), absolute urls
     * (returned untouched), protocol-relative ("//host/x"), root-relative
     * ("/x"), plain relative ("x") and parent-relative ("../x") addresses.
     *
     * @see    http://ca3.php.net/manual/en/function.realpath.php#86384
     * @access protected
     * @param  string $addr address as written in the document
     * @param  string $base absolute url, expected to end with a slash
     * @return string
     */
    protected function _resolveFullPath($addr, $base)
    {
        // empty address provided
        if (empty($addr) === true) {
            return $base;
        }

        // parse address; if scheme found, doesn't need to be resolved
        // (parse_url returns false on seriously malformed input)
        $parsed = parse_url($addr);
        if (is_array($parsed) === true && array_key_exists('scheme', $parsed)) {
            return $addr;
        }

        // parse base passed in (expected to be a full url)
        $parsed = parse_url($base);
        if (is_array($parsed) === false) {
            $parsed = array();
        }
        $origin = $this->_buildOrigin($parsed);

        // protocol-relative: reuse the scheme of the base
        if (substr($addr, 0, 2) === '//') {
            $scheme = isset($parsed['scheme']) ? $parsed['scheme'] : '';
            return $scheme . '://' . substr($addr, 2);
        }

        // root-relative: goes to the top of the tree
        if ($addr[0] === '/') {
            return $origin . $addr;
        }

        // no parent-directory traversal: simply append to the base
        if (strpos($addr, '../') === false) {
            return $base . $addr;
        }

        // set up directory pieces for traversing/replacing; the last base
        // piece is the (empty) segment after the trailing slash
        $addrPieces = explode('../', $addr);
        $basePath = isset($parsed['path']) ? $parsed['path'] : '/';
        $basePieces = explode('/', $basePath);
        array_pop($basePieces);
        $levels = count($addrPieces) - 1;

        // directories of the base that survive climbing $levels levels;
        // filter on length so that a directory named "0" is kept
        $replacements = array_slice($basePieces, 0, 0 - $levels);
        $replacements = array_filter($replacements, 'strlen');

        // add last non-empty piece of the address as tail
        $tail = array_pop($addrPieces);
        if (empty($tail) === false) {
            $replacements[] = $tail;
        }

        // return traversed address
        return $origin . '/' . implode('/', $replacements);
    }

    /**
     * _parseBase
     *
     * Determines the url relative paths are resolved against. Defaults to
     * the directory of the document url; a base tag overrides it unless
     * that tag carries a target attribute other than "_self".
     *
     * @todo   find url's that have various base values, and test them
     * @access private
     * @return string always ends with a slash
     */
    private function _parseBase()
    {
        // search for base tags carrying an href
        preg_match_all(
            '/<base\b[^>]*?\shref\s*=\s*(\'|")((?:(?!\1).)*)\1[^>]*>/is',
            $this->_body,
            $bases
        );

        // directory of the document url (filename, if any, removed)
        $components = parse_url($this->_url);
        $path = $this->_buildOrigin($components);
        if (is_array($components) === true && isset($components['path'])) {
            $path .= $components['path'];
        } else {
            $path .= '/';
        }
        $path = preg_replace('/[^\/]*$/U', '', $path);

        // no base tags found
        if (empty($bases[2]) === true) {
            return $path;
        }

        // last base tag wins
        $base = trim(array_pop($bases[2]));
        $found = array_pop($bases[0]);

        // a target other than the document itself: fall back to the
        // document url
        if (preg_match('/target=/', $found)) {
            $self = preg_match('/target=[\'"]{1}_self[\'"]{1}/', $found);
            if (!$self) {
                return $path;
            }
        }

        // resolve path (trailing slash always required)
        $resolved = $this->_resolveFullPath($base, $path);
        if (!preg_match('/\/$/', $resolved)) {
            return $resolved . '/';
        }
        return $resolved;
    }

    /**
     * _parseCharset
     *
     * Reads the charset declared through a meta tag; "utf8" is normalised
     * to "utf-8".
     *
     * @access private
     * @return false|string lower-cased charset, false when none declared
     */
    private function _parseCharset()
    {
        preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $this->_body,
            $charset
        );
        if (empty($charset) === true) {
            return false;
        }

        $charset = trim(strtolower(array_pop($charset)));
        if ($charset === 'utf8') {
            return 'utf-8';
        }
        return $charset;
    }

    /**
     * _parseDescription
     *
     * @access private
     * @return false|string trimmed description, false when the meta tag is
     *         absent (an empty tag yields an empty string)
     */
    private function _parseDescription()
    {
        $description = $this->_parseMetaTag('description');
        if ($description === false) {
            return false;
        }
        return trim($description);
    }

    /**
     * _parseFavicon
     *
     * Looks for a link tag whose rel attribute contains "icon" (rel before
     * href first, then href before rel) and resolves its href. Falls back
     * to /favicon.ico on the document's host.
     *
     * @access private
     * @return string
     */
    private function _parseFavicon()
    {
        // generate default
        $default = $this->_buildOrigin(parse_url($this->_url)) . '/favicon.ico';

        // rel attribute leading (href is capture 3)
        preg_match_all(
            '/<link\b[^>]*?\srel\s*=\s*(\'|")[^\'"]*icon[^\'"]*\1[^>]*?\shref\s*=\s*(\'|")((?:(?!\2).)*)\2/is',
            $this->_body,
            $favicons
        );
        if (empty($favicons[3]) === true) {

            // rel attribute trailing (href is capture 2)
            preg_match_all(
                '/<link\b[^>]*?\shref\s*=\s*(\'|")((?:(?!\1).)*)\1[^>]*?\srel\s*=\s*(\'|")[^\'"]*icon[^\'"]*\3/is',
                $this->_body,
                $favicons
            );

            // no favicon found
            if (empty($favicons[2]) === true) {
                return $default;
            }
            $favicon = array_pop($favicons[2]);
        } else {
            $favicon = array_pop($favicons[3]);
        }

        // resolve full path
        $favicon = $this->_resolveFullPath(trim($favicon), $this->getBase());
        return str_replace(array("\r", "\n"), '', $favicon);
    }

    /**
     * _parseImages
     *
     * Collects the src of every img tag, resolved against the base.
     * Empty src values are skipped; duplicates are removed.
     *
     * @access private
     * @return array list of absolute image urls, in document order
     */
    private function _parseImages()
    {
        preg_match_all(
            '/<img\b[^>]*?\ssrc\s*=\s*(\'|")((?:(?!\1).)*)\1/is',
            $this->_body,
            $matches
        );
        if (empty($matches[2]) === true) {
            return array();
        }

        $base = $this->getBase();
        $images = array();
        foreach ($matches[2] as $source) {
            $source = trim($source);
            if ($source === '') {
                continue;
            }
            $resolved = $this->_resolveFullPath($source, $base);
            $images[] = str_replace(array("\r", "\n"), '', $resolved);
        }
        return array_values(array_unique($images));
    }

    /**
     * _parseKeywords
     *
     * @access private
     * @return false|array comma-separated keywords, each trimmed; false
     *         when the meta tag is absent
     */
    private function _parseKeywords()
    {
        $keywords = $this->_parseMetaTag('keywords');
        if ($keywords === false) {
            return false;
        }
        return array_map('trim', explode(',', $keywords));
    }

    /**
     * _parseMetaTag
     *
     * Returns the content attribute of the last meta tag whose $attr
     * attribute equals $value (case-insensitive). Tags written with the
     * identifying attribute before content are preferred; the reverse
     * order is only tried when none of those exist.
     *
     * @access protected
     * @param  string $value eg. 'description' or 'og:image'
     * @param  string $attr  identifying attribute, eg. 'name' or 'property'
     * @return false|string content as written (may be empty); false when
     *         no such tag exists
     */
    protected function _parseMetaTag($value, $attr = 'name')
    {
        // both are interpolated into a pattern; treat them literally
        $value = preg_quote($value, '/');
        $attr = preg_quote($attr, '/');

        // identifying attribute leading (content is capture 3)
        preg_match_all(
            '/<meta\b[^>]*?\s' . $attr . '\s*=\s*(\'|")' . $value .
                '\1[^>]*?\scontent\s*=\s*(\'|")((?:(?!\2).)*)\2/is',
            $this->_body,
            $tags
        );
        if (empty($tags[3]) === false) {
            return array_pop($tags[3]);
        }

        // identifying attribute trailing (content is capture 2)
        preg_match_all(
            '/<meta\b[^>]*?\scontent\s*=\s*(\'|")((?:(?!\1).)*)\1[^>]*?\s' .
                $attr . '\s*=\s*(\'|")' . $value . '\3/is',
            $this->_body,
            $tags
        );
        if (empty($tags[2]) === true) {
            return false;
        }
        return array_pop($tags[2]);
    }

    /**
     * _parseOpenGraphKeys
     *
     * Finds every quoted "og:<key>" token in the document.
     *
     * @access protected
     * @return array distinct keys without the "og:" prefix
     */
    protected function _parseOpenGraphKeys()
    {
        preg_match_all(
            '/([\'|"]{1})og:([a-zA-Z0-9\-:_]{1,25})\1/',
            $this->_body,
            $keys
        );
        if (empty($keys[2]) === true) {
            return array();
        }
        return array_values(array_unique($keys[2]));
    }

    /**
     * _parseSocialNetwork
     *
     * Extracts the first path segment following a social network's host,
     * first from href attributes and then from anywhere in the document.
     *
     * @access public
     * @param  string $network 'facebook', 'twitter', 'instagram' or
     *         'pinterest'
     * @return string|false false when nothing usable is found or the
     *         network is not supported
     */
    public function _parseSocialNetwork($network)
    {
        $hosts = array(
            'facebook' => 'facebook.com',
            'twitter' => 'twitter.com',
            'instagram' => 'instagram.com',
            'pinterest' => 'pinterest.com'
        );

        // path segments that are site sections rather than account names
        $exemptions = array(
            'facebook' => array(),
            'twitter' => array(),
            'instagram' => array(),
            'pinterest' => array('pin')
        );
        if (isset($hosts[$network]) === false) {
            return false;
        }
        $host = preg_quote($hosts[$network], '/');

        // strict (href-bound) pattern first, loose pattern second
        $patterns = array(
            '/href=(\'|").+' . ($host) . '\/([^"\'\?\#\/]+)\1/U',
            '/.+' . ($host) . '\/([^"\'\?\#\/]+)/'
        );
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $this->_body, $matches) === 1) {
                $id = array_pop($matches);
                if (in_array($id, $exemptions[$network], true) === false) {
                    return $id;
                }
            }
        }
        return false;
    }

    /**
     * _parseTitle
     *
     * @access private
     * @return false|string trimmed title, false when there is no
     *         (non-empty) title tag
     */
    private function _parseTitle()
    {
        preg_match('/<title[^>]*>([^<]+)<\/title>/im', $this->_body, $titles);
        if (empty($titles) === true) {
            return false;
        }
        return trim(array_pop($titles));
    }

    /**
     * _parseYouTubeChannel
     *
     * Extracts the channel id from the first double-quoted href pointing
     * at youtube.com/channel/<id> (an escaped "\/" after "channel" is
     * accepted as well).
     *
     * @access public
     * @return string|false
     */
    public function _parseYouTubeChannel()
    {
        $host = preg_quote('youtube.com', '/');
        $pattern = '/href="[^"]+' . ($host) . '\/channel[\\\]?\/([^"\?\#\/]+)/';
        if (preg_match($pattern, $this->_body, $matches) === 1) {
            return array_pop($matches);
        }
        return false;
    }

    /**
     * getBase
     *
     * @access public
     * @return string url relative paths are resolved against; always ends
     *         with a slash
     */
    public function getBase()
    {
        return $this->_memoize('base', '_parseBase');
    }

    /**
     * getCharset
     *
     * Returns the charset defined in the document, which may or may not be
     * the charset that is rendered by the browser. This is because
     * charsets passed from a server directive supersede those defined in
     * the document.
     *
     * @access public
     * @return false|string
     */
    public function getCharset()
    {
        return $this->_memoize('charset', '_parseCharset');
    }

    /**
     * getDescription
     *
     * @access public
     * @return false|string
     */
    public function getDescription()
    {
        return $this->_memoize('description', '_parseDescription');
    }

    /**
     * getDetails
     *
     * @access public
     * @return array every detail this class can extract, keyed by name
     */
    public function getDetails()
    {
        return array(
            'base' => $this->getBase(),
            'charset' => $this->getCharset(),
            'favicon' => $this->getFavicon(),
            'meta' => array(
                'description' => $this->getDescription(),
                'keywords' => $this->getKeywords()
            ),
            'images' => $this->getImages(),
            'openGraph' => $this->getOpenGraph(),
            'social' => $this->getSocial(),
            'title' => $this->getTitle(),
            'url' => $this->getUrl()
        );
    }

    /**
     * getFavicon
     *
     * @notes  never false: a default (eg. http://domain.com/favicon.ico)
     *         is returned when the document does not declare one
     * @access public
     * @return string
     */
    public function getFavicon()
    {
        return $this->_memoize('favicon', '_parseFavicon');
    }

    /**
     * getImages
     *
     * @access public
     * @return array
     */
    public function getImages()
    {
        return $this->_memoize('images', '_parseImages');
    }

    /**
     * getKeywords
     *
     * @access public
     * @return false|array
     */
    public function getKeywords()
    {
        return $this->_memoize('keywords', '_parseKeywords');
    }

    /**
     * getOpenGraph
     *
     * Maps every Open Graph key found in the document to the content of
     * its meta tag (false when the key is only mentioned, not declared).
     * When a non-empty og:image exists, 'imagePath' holds its absolute
     * url.
     *
     * @access public
     * @return array
     */
    public function getOpenGraph()
    {
        $graph = array();
        foreach ($this->_parseOpenGraphKeys() as $key) {
            $graph[$key] = $this->_parseMetaTag('og:' . ($key), 'property');
        }

        // resolve path to open graph image, if one was actually declared
        if (isset($graph['image']) && is_string($graph['image'])) {
            $image = trim($graph['image']);
            if ($image !== '') {
                $graph['imagePath'] = $this->_resolveFullPath(
                    $image,
                    $this->getBase()
                );
            }
        }
        return $graph;
    }

    /**
     * getSocial
     *
     * @access public
     * @return array handle (or false) per supported network
     */
    public function getSocial()
    {
        return array(
            'facebook' => $this->_parseSocialNetwork('facebook'),
            'twitter' => $this->_parseSocialNetwork('twitter'),
            'instagram' => $this->_parseSocialNetwork('instagram'),
            'pinterest' => $this->_parseSocialNetwork('pinterest'),
            'youTube' => $this->_parseYouTubeChannel()
        );
    }

    /**
     * getTitle
     *
     * @access public
     * @return false|string
     */
    public function getTitle()
    {
        return $this->_memoize('title', '_parseTitle');
    }

    /**
     * getUrl
     *
     * @access public
     * @return string url passed to the constructor
     */
    public function getUrl()
    {
        return $this->_url;
    }
}