* @author Oliver Nassar
* @notes  The following urls provide good examples of the parsing engine:
*         http://www.bbc.com/
*         http://www.nytimes.com/
*         http://techcrunch.com/
*         http://metallo.scripps.edu/
*         http://jobs.businessinsider.com/job/3b9d1c8e1e5e31d4e4fefb4a551b8b90/?d=1&source=site_home
*         http://www.wikipedia.org
*         http://yahoo.com
*         http://twitter.com/wikileaks/status/8920530488926208#s
*         http://veryawesomeworld.com/awesomebook/inside.html
* @example
*
*     // booting
*     require_once APP . '/vendors/PHP-Curler/Curler.class.php';
*     require_once APP . '/vendors/PHP-MetaParser/MetaParser.class.php';
*
*     // curling
*     $curler = new Curler();
*     $url = 'http://www.bbc.com/';
*     $content = $curler->get($url);
*     $parser = new MetaParser($content, $url);
*     print_r($parser->getDetails());
*
*/
class MetaParser
{
    /**
     * _parsed
     *
     * Cache of values already extracted from the body, keyed by detail name
     * (base, charset, description, favicon, images, keywords, title). A
     * cached value of false means "looked for, not found".
     *
     * @var    array
     * @access protected
     */
    protected $_parsed = array();

    /**
     * _body
     *
     * Raw markup that is searched by the parsing methods.
     *
     * @var    string
     * @access protected
     */
    protected $_body;

    /**
     * _url
     *
     * Address the body was retrieved from; used to resolve relative paths.
     *
     * @var    string
     * @access protected
     */
    protected $_url;

    /**
     * __construct
     *
     * Requires the content body and url to be provided. The url is useful
     * to generate relative paths for images, etc.
     *
     * @access public
     * @param  string $body
     * @param  string $url
     * @return void
     */
    public function __construct($body, $url)
    {
        // a failed fetch may hand over false/null; the regex calls need a string
        $this->_body = (string) $body;
        $this->_url = $url;
    }

    /**
     * _buildRoot
     *
     * Builds the "scheme://host[:port]" prefix from parse_url() output.
     *
     * @access private
     * @param  array|false $components result of parse_url()
     * @return false|string false when the scheme or host is missing
     */
    private function _buildRoot($components)
    {
        if (is_array($components) === false
            || isset($components['scheme'], $components['host']) === false
        ) {
            return false;
        }
        $root = $components['scheme'] . '://' . $components['host'];

        // keep an explicit port, otherwise resolved urls point at the wrong
        // server
        if (isset($components['port']) === true) {
            $root .= ':' . $components['port'];
        }
        return $root;
    }

    /**
     * _resolveFullPath
     *
     * Turns an address found in the document into an absolute url, using
     * $base as the reference. Handles absolute urls (returned untouched),
     * protocol-relative ('//host/path'), root-relative ('/path') and
     * relative paths, collapsing '.' and '..' segments wherever they occur.
     *
     * @see    http://ca3.php.net/manual/en/function.realpath.php#86384
     * @access protected
     * @param  string $addr
     * @param  string $base full url that relative addresses are resolved against
     * @return string
     */
    protected function _resolveFullPath($addr, $base)
    {
        // nothing to resolve; compared explicitly because empty() would also
        // swallow the valid relative path "0"
        if ($addr === null || $addr === false || $addr === '') {
            return $base;
        }
        $addr = (string) $addr;

        // parse address; if scheme found, doesn't need to be resolved
        // (parse_url returns false on seriously malformed input)
        $parsed = parse_url($addr);
        if (is_array($parsed) === true && isset($parsed['scheme']) === true) {
            return $addr;
        }

        // parse base passed in; without a scheme and host there is nothing to
        // resolve against, so hand the address back unchanged
        $parsed = parse_url($base);
        $root = $this->_buildRoot($parsed);
        if ($root === false) {
            return $addr;
        }

        // protocol-relative: reuse the scheme of the base
        if (strncmp($addr, '//', 2) === 0) {
            return $parsed['scheme'] . ':' . $addr;
        }

        // otherwise if the address should go to the top of the tree
        if ($addr[0] === '/') {
            return $root . $addr;
        }

        // query-only or fragment-only reference
        $cut = strcspn($addr, '?#');
        if ($cut === 0) {
            return $base . $addr;
        }

        // separate the path from its query/fragment so that dots inside the
        // query string are never treated as path segments
        $relative = substr($addr, 0, $cut);
        $suffix = (string) substr($addr, $cut);

        // directory of the base (anything after the last slash is a filename)
        $directory = isset($parsed['path']) === true ? $parsed['path'] : '/';
        $slash = strrpos($directory, '/');
        $directory = $slash === false ? '/' : substr($directory, 0, $slash + 1);

        // merge, then collapse dot-segments
        $segments = explode('/', $directory . $relative);
        array_shift($segments); // empty piece in front of the leading slash
        $last = count($segments) - 1;
        $resolved = array();
        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                if ($segment === '..') {
                    array_pop($resolved);
                }

                // a trailing dot-segment still denotes a directory
                if ($index === $last) {
                    $resolved[] = '';
                }
                continue;
            }
            $resolved[] = $segment;
        }
        return $root . '/' . implode('/', $resolved) . $suffix;
    }

    /**
     * _parseBase
     *
     * Determines the url that relative paths in the document are resolved
     * against: the href of the last <base> tag when one is usable, otherwise
     * the directory of the document url. Always ends with a slash when the
     * document url could be parsed.
     *
     * @todo   find url's that have various base values, and test them
     * @access private
     * @return string
     */
    private function _parseBase()
    {
        // search for base tags (index 2 holds the href values)
        // NOTE(review): the original pattern was lost from this file; this
        // one was rebuilt from how the captures are used -- confirm against
        // upstream
        preg_match_all(
            '/<base\b[^>]*?\shref\s*=\s*([\'"])((?:(?!\1).)*)\1[^>]*>/is',
            $this->_body,
            $bases
        );

        // directory of the document itself (filename, if any, removed)
        $components = parse_url($this->_url);
        $root = $this->_buildRoot($components);
        $path = '';
        if ($root !== false) {
            $path = $root;
            $path .= isset($components['path']) === true
                ? $components['path']
                : '/';
            $path = substr($path, 0, strrpos($path, '/') + 1);
        }

        // no base tags found
        if (empty($bases[2]) === true) {
            return $path;
        }

        // the last base tag is the one that is used
        $base = trim(array_pop($bases[2]));
        $found = array_pop($bases[0]);

        // a base tag carrying a target other than the document itself is
        // ignored in favour of the url being 'grabbed'
        if (preg_match('/target=/', $found) === 1
            && preg_match('/target=[\'"]_self[\'"]/', $found) !== 1
        ) {
            return $path;
        }

        // resolve path (check for trailing slash; always required)
        $resolved = $this->_resolveFullPath($base, $path);
        if (substr($resolved, -1) !== '/') {
            $resolved .= '/';
        }
        return $resolved;
    }

    /**
     * _parseCharset
     *
     * Reads the charset declared in a meta tag, lowercased; 'utf8' is
     * normalised to 'utf-8'.
     *
     * @access private
     * @return false|string false when no charset declaration was matched
     */
    private function _parseCharset()
    {
        // NOTE(review): the leading '<meta[^>' of this pattern was missing
        // from the file and has been restored -- confirm against upstream
        $matched = preg_match(
            '#<meta[^>]*?charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $this->_body,
            $found
        );
        if ($matched !== 1) {
            return false;
        }
        $charset = trim(strtolower($found[1]));
        if ($charset === 'utf8') {
            return 'utf-8';
        }
        return $charset;
    }

    /**
     * _parseDescription
     *
     * @access private
     * @return false|string trimmed description; false when the meta tag is
     *         not there at all
     */
    private function _parseDescription()
    {
        $description = $this->_parseMetaTag('description');
        if ($description === false) {
            return false;
        }
        return trim($description);
    }

    /**
     * _parseFavicon
     *
     * Finds the href of the last <link> tag whose rel is 'icon' or
     * 'shortcut icon' and resolves it against the document base. Falls back
     * to /favicon.ico on the host of the document url.
     *
     * @access private
     * @return false|string false only when no icon link was found and the
     *         document url could not be parsed either
     */
    private function _parseFavicon()
    {
        // generate default
        $root = $this->_buildRoot(parse_url($this->_url));
        $default = $root === false ? false : $root . '/favicon.ico';

        // NOTE(review): both original patterns were lost from this file;
        // these were rebuilt from how the captures are used -- confirm
        // against upstream
        $rel = '\srel\s*=\s*([\'"])(?:shortcut\s+)?icon';
        $href = '\shref\s*=\s*([\'"])((?:(?!%s).)*)';

        // get the page links (rel attribute leading; href is capture 3)
        preg_match_all(
            '/<link\b[^>]*?' . $rel . '\1[^>]*?' . sprintf($href, '\2') . '\2/is',
            $this->_body,
            $favicons
        );
        if (empty($favicons[0]) === false) {
            $favicon = array_pop($favicons[3]);
        } else {
            // get the page links (rel attribute trailing; href is capture 2)
            $rel = '\srel\s*=\s*([\'"])(?:shortcut\s+)?icon';
            preg_match_all(
                '/<link\b[^>]*?' . sprintf($href, '\1') . '\1[^>]*?' . $rel . '\3/is',
                $this->_body,
                $favicons
            );

            // no favicon found
            if (empty($favicons[0]) === true) {
                return $default;
            }
            $favicon = array_pop($favicons[2]);
        }

        // attribute values may be wrapped over several lines
        $favicon = trim(str_replace(array("\r", "\n"), '', $favicon));
        return $this->_resolveFullPath($favicon, $this->getBase());
    }

    /**
     * _parseImages
     *
     * Collects the src of every <img> tag, resolved against the document
     * base, without duplicates and in document order.
     *
     * @access private
     * @return array
     */
    private function _parseImages()
    {
        // get the page images (src value is capture 3)
        // NOTE(review): the original pattern was lost from this file; this
        // one was rebuilt from how the captures are used -- confirm against
        // upstream
        preg_match_all(
            '/<img\b([^>]*?)\ssrc\s*=\s*([\'"])((?:(?!\2).)*)\2/is',
            $this->_body,
            $matches
        );
        if (empty($matches[3]) === true) {
            return array();
        }

        // base the images
        $base = $this->getBase();
        $images = array();
        foreach ($matches[3] as $source) {
            $source = trim(str_replace(array("\r", "\n"), '', $source));

            // an empty src would resolve to the base itself, which is not an
            // image
            if ($source === '') {
                continue;
            }
            $images[] = $this->_resolveFullPath($source, $base);
        }
        return array_values(array_unique($images));
    }

    /**
     * _parseKeywords
     *
     * @access private
     * @return false|array comma-separated keywords, trimmed; false when the
     *         meta tag is not there at all
     */
    private function _parseKeywords()
    {
        $keywords = $this->_parseMetaTag('keywords');
        if ($keywords === false) {
            return false;
        }
        return array_map('trim', explode(',', $keywords));
    }

    /**
     * _parseMetaTag
     *
     * Returns the content attribute of the last <meta> tag whose $attr
     * attribute equals $value, whichever order the two attributes appear
     * in. Tags listing the identifying attribute first take precedence.
     *
     * @access protected
     * @param  string $value value identifying the tag (eg. 'description')
     * @param  string $attr attribute holding that value (eg. 'property')
     * @return false|string false when no such tag exists (as opposed to a
     *         tag with empty content, which yields an empty string)
     */
    protected function _parseMetaTag($value, $attr = 'name')
    {
        // both come from callers; never let them alter the expression
        $attr = preg_quote($attr, '/');
        $value = preg_quote($value, '/');

        // NOTE(review): both original patterns were lost from this file;
        // these were rebuilt from how the captures are used -- confirm
        // against upstream

        // identifying attribute leading (content is capture 3)
        preg_match_all(
            '/<meta\b[^>]*?\s' . $attr . '\s*=\s*([\'"])' . $value . '\1'
                . '[^>]*?\scontent\s*=\s*([\'"])((?:(?!\2).)*)\2/is',
            $this->_body,
            $tags
        );
        if (empty($tags[0]) === false) {
            return array_pop($tags[3]);
        }

        // identifying attribute trailing (content is capture 2)
        preg_match_all(
            '/<meta\b[^>]*?\scontent\s*=\s*([\'"])((?:(?!\1).)*)\1'
                . '[^>]*?\s' . $attr . '\s*=\s*([\'"])' . $value . '\3/is',
            $this->_body,
            $tags
        );
        if (empty($tags[0]) === true) {
            return false;
        }
        return array_pop($tags[2]);
    }

    /**
     * _parseOpenGraphKeys
     *
     * Lists the distinct open graph keys (the part after 'og:') that appear
     * as quoted values anywhere in the body.
     *
     * @access protected
     * @return array
     */
    protected function _parseOpenGraphKeys()
    {
        preg_match_all(
            '/([\'"])og:([a-zA-Z0-9\-:_]{1,25})\1/',
            $this->_body,
            $keys
        );
        return array_values(array_unique($keys[2]));
    }

    /**
     * _parseSocialNetwork
     *
     * Looks for a link to the given network and returns the first path
     * segment after its host (ie. the account name). Links inside an href
     * attribute are tried first, then any mention in the body.
     *
     * @access public
     * @param  string $network facebook, twitter, instagram or pinterest
     * @return string|false false when nothing was found or the network is
     *         not a supported one
     */
    public function _parseSocialNetwork($network)
    {
        $hosts = array(
            'facebook' => 'facebook.com',
            'twitter' => 'twitter.com',
            'instagram' => 'instagram.com',
            'pinterest' => 'pinterest.com'
        );

        // path segments that are part of the network's own url scheme rather
        // than an account
        $exemptions = array(
            'facebook' => array(),
            'twitter' => array(),
            'instagram' => array(),
            'pinterest' => array('pin')
        );
        if (isset($hosts[$network]) === false) {
            return false;
        }
        $host = preg_quote($hosts[$network], '/');
        $patterns = array(
            '/href=(\'|").+' . $host . '\/([^"\'\?\#\/]+)\1/U',
            '/.+' . $host . '\/([^"\'\?\#\/]+)/'
        );
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $this->_body, $matches) !== 1) {
                continue;
            }

            // the account is the last capture of either expression
            $id = array_pop($matches);
            if (in_array($id, $exemptions[$network], true) === false) {
                return $id;
            }
        }
        return false;
    }

    /**
     * _parseTitle
     *
     * @access private
     * @return false|string trimmed contents of the title tag
     */
    private function _parseTitle()
    {
        // NOTE(review): the leading '<title[^>' of this pattern was missing
        // from the file and has been restored -- confirm against upstream
        $matched = preg_match(
            '/<title[^>]*>([^<]+)<\/title>/im',
            $this->_body,
            $titles
        );
        if ($matched !== 1) {
            return false;
        }
        return trim($titles[1]);
    }

    /**
     * _parseYouTubeChannel
     *
     * Returns the channel id of the first youtube.com/channel/ link found
     * in a double-quoted href (the slash after 'channel' may be escaped
     * with a backslash).
     *
     * @access public
     * @return string|false
     */
    public function _parseYouTubeChannel()
    {
        $host = preg_quote('youtube.com', '/');
        $pattern = '/href="[^"]+' . $host . '\/channel[\\\\]?\/([^"\?\#\/]+)/';
        if (preg_match($pattern, $this->_body, $matches) === 1) {
            return $matches[1];
        }
        return false;
    }

    /**
     * getBase
     *
     * @access public
     * @return string
     */
    public function getBase()
    {
        if (array_key_exists('base', $this->_parsed) === false) {
            $this->_parsed['base'] = $this->_parseBase();
        }
        return $this->_parsed['base'];
    }

    /**
     * getCharset
     *
     * Returns the charset defined in the document, which may or may not be
     * the charset that is rendered by the browser. This is because
     * charsets passed from a server directive supersede those defined in
     * the document.
     *
     * @access public
     * @return false|string
     */
    public function getCharset()
    {
        if (array_key_exists('charset', $this->_parsed) === false) {
            $this->_parsed['charset'] = $this->_parseCharset();
        }
        return $this->_parsed['charset'];
    }

    /**
     * getDescription
     *
     * @access public
     * @return false|string
     */
    public function getDescription()
    {
        if (array_key_exists('description', $this->_parsed) === false) {
            $this->_parsed['description'] = $this->_parseDescription();
        }
        return $this->_parsed['description'];
    }

    /**
     * getDetails
     *
     * @access public
     * @return array every detail this class can extract, keyed by name
     */
    public function getDetails()
    {
        return array(
            'base' => $this->getBase(),
            'charset' => $this->getCharset(),
            'favicon' => $this->getFavicon(),
            'meta' => array(
                'description' => $this->getDescription(),
                'keywords' => $this->getKeywords()
            ),
            'images' => $this->getImages(),
            'openGraph' => $this->getOpenGraph(),
            'social' => $this->getSocial(),
            'title' => $this->getTitle(),
            'url' => $this->getURL()
        );
    }

    /**
     * getFavicon
     *
     * @notes  a default is returned when the document does not name an
     *         icon (eg. domain.com/favicon.ico)
     * @access public
     * @return false|string
     */
    public function getFavicon()
    {
        if (array_key_exists('favicon', $this->_parsed) === false) {
            $this->_parsed['favicon'] = $this->_parseFavicon();
        }
        return $this->_parsed['favicon'];
    }

    /**
     * getImages
     *
     * @access public
     * @return array
     */
    public function getImages()
    {
        if (array_key_exists('images', $this->_parsed) === false) {
            $this->_parsed['images'] = $this->_parseImages();
        }
        return $this->_parsed['images'];
    }

    /**
     * getKeywords
     *
     * @access public
     * @return false|array
     */
    public function getKeywords()
    {
        if (array_key_exists('keywords', $this->_parsed) === false) {
            $this->_parsed['keywords'] = $this->_parseKeywords();
        }
        return $this->_parsed['keywords'];
    }

    /**
     * getOpenGraph
     *
     * Maps every open graph key found in the body to the content of its
     * meta tag (false when the key is mentioned without a matching tag).
     * When an image is present, 'imagePath' holds its absolute url.
     *
     * @access public
     * @return array
     */
    public function getOpenGraph()
    {
        $graph = array();
        foreach ($this->_parseOpenGraphKeys() as $key) {
            $graph[$key] = $this->_parseMetaTag('og:' . $key, 'property');
        }

        // resolve path to open graph image, if found
        if (isset($graph['image']) === true
            && is_string($graph['image']) === true
            && $graph['image'] !== ''
        ) {
            $graph['imagePath'] = $this->_resolveFullPath(
                $graph['image'],
                $this->getBase()
            );
        }
        return $graph;
    }

    /**
     * getSocial
     *
     * @access public
     * @return array account (or false) per supported network
     */
    public function getSocial()
    {
        return array(
            'facebook' => $this->_parseSocialNetwork('facebook'),
            'twitter' => $this->_parseSocialNetwork('twitter'),
            'instagram' => $this->_parseSocialNetwork('instagram'),
            'pinterest' => $this->_parseSocialNetwork('pinterest'),
            'youTube' => $this->_parseYouTubeChannel()
        );
    }

    /**
     * getTitle
     *
     * @access public
     * @return false|string
     */
    public function getTitle()
    {
        if (array_key_exists('title', $this->_parsed) === false) {
            $this->_parsed['title'] = $this->_parseTitle();
        }
        return $this->_parsed['title'];
    }

    /**
     * getURL
     *
     * @access public
     * @return string
     */
    public function getURL()
    {
        return $this->_url;
    }
}