Create MetaParser.class.php
This commit is contained in:
parent
79456367fa
commit
26992d0a26
720
vendor/PHP-MetaParser/MetaParser.class.php
vendored
Normal file
720
vendor/PHP-MetaParser/MetaParser.class.php
vendored
Normal file
|
@ -0,0 +1,720 @@
|
|||
<?php
|
||||
|
||||
/**
|
||||
* MetaParser
|
||||
*
|
||||
* Parses content for meta and open graph details. Useful when used with a
|
||||
* curling library.
|
||||
*
|
||||
* @see <https://github.com/onassar/PHP-Curler>
|
||||
* @author Oliver Nassar <onassar@gmail.com>
|
||||
* @todo add support for paths leading with '//' (aka. use same scheme)
|
||||
* @notes The following urls provide good examples of the parsing engine:
|
||||
* http://www.bbc.com/
|
||||
* http://www.nytimes.com/
|
||||
* http://techcrunch.com/
|
||||
* http://metallo.scripps.edu/
|
||||
* http://jobs.businessinsider.com/job/3b9d1c8e1e5e31d4e4fefb4a551b8b90/?d=1&source=site_home
|
||||
* http://www.wikipedia.org
|
||||
* http://yahoo.com
|
||||
* http://twitter.com/wikileaks/status/8920530488926208#s
|
||||
* http://veryawesomeworld.com/awesomebook/inside.html
|
||||
* @example
|
||||
* <code>
|
||||
* // booting
|
||||
* require_once APP . '/vendors/PHP-Curler/Curler.class.php';
|
||||
* require_once APP . '/vendors/PHP-MetaParser/MetaParser.class.php';
|
||||
*
|
||||
* // curling
|
||||
* $curler = new Curler();
|
||||
* $url = 'http://www.bbc.com/';
|
||||
* $content = $curler->get($url);
|
||||
* $parser = new MetaParser($content, $url);
|
||||
* print_r($parser->getDetails());
|
||||
* <code>
|
||||
*/
|
||||
class MetaParser
|
||||
{
|
||||
/**
|
||||
* _parsed
|
||||
*
|
||||
* @var array
|
||||
* @access protected
|
||||
*/
|
||||
protected $_parsed;
|
||||
|
||||
/**
|
||||
* _body.
|
||||
*
|
||||
* @var string
|
||||
* @access protected
|
||||
*/
|
||||
protected $_body;
|
||||
|
||||
/**
|
||||
* _url
|
||||
*
|
||||
* @var string
|
||||
* @access protected
|
||||
*/
|
||||
protected $_url;
|
||||
|
||||
/**
|
||||
* __construct
|
||||
*
|
||||
* Requires the content body and url to be provided. The url is useful
|
||||
* to generate relative paths for images, etc.
|
||||
*
|
||||
* @access public
|
||||
* @param String $body
|
||||
* @param String $url
|
||||
* @return void
|
||||
*/
|
||||
public function __construct($body, $url)
|
||||
{
|
||||
$this->_body = $body;
|
||||
$this->_url = $url;
|
||||
}
|
||||
|
||||
/**
|
||||
* resolveFullPath
|
||||
*
|
||||
* @see http://ca3.php.net/manual/en/function.realpath.php#86384
|
||||
* @access protected
|
||||
* @param string $addr
|
||||
* @param string $base
|
||||
* @return string
|
||||
*/
|
||||
protected function _resolveFullPath($addr, $base)
|
||||
{
|
||||
// empty address provided
|
||||
if (empty($addr) === true) {
|
||||
return $base;
|
||||
}
|
||||
|
||||
// parse address; if scheme found, doesn't need to be resolved
|
||||
$parsed = parse_url($addr);
|
||||
if(array_key_exists('scheme', $parsed)) {
|
||||
return $addr;
|
||||
}
|
||||
|
||||
// parse base passed in (will always be a full url)
|
||||
$parsed = parse_url($base);
|
||||
|
||||
// protocol specific
|
||||
if (mb_substr($addr, 0, 2) === '//') {
|
||||
return ($parsed['scheme']) . '://' . mb_substr($addr, 2);
|
||||
}
|
||||
// otherwise if the address should go to the top of the tree
|
||||
elseif ($addr{0} === '/') {
|
||||
return ($parsed['scheme']) . '://' . ($parsed['host']) .
|
||||
($addr);
|
||||
}
|
||||
|
||||
// if the address doesn't contain any sub-directory calls
|
||||
if (!strstr($addr, '../')) {
|
||||
return ($base) . ($addr);
|
||||
}
|
||||
|
||||
// set-up sub-directory pieces for traversing/replacing
|
||||
$pieces['addr'] = explode('../', $addr);
|
||||
$pieces['base'] = explode('/', $parsed['path']);
|
||||
array_pop($pieces['base']);
|
||||
$count = count($pieces['addr']) - 1;
|
||||
|
||||
// array of respective sub-directory replacements (from base)
|
||||
$replacements = array_slice($pieces['base'], 0, 0 - $count);
|
||||
$replacements = array_filter($replacements);
|
||||
|
||||
// add last non-empty sub-directory as tail
|
||||
$tail = array_pop($pieces['addr']);
|
||||
if (empty($tail) === false) {
|
||||
$replacements[] = $tail;
|
||||
}
|
||||
|
||||
// return sub-directory traversed address
|
||||
return ($parsed['scheme']) . '://' . ($parsed['host']) .
|
||||
'/' . implode('/', $replacements);
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseBase
|
||||
*
|
||||
* @todo find url's that have various base values, and test them
|
||||
* @access private
|
||||
* @return string
|
||||
*/
|
||||
private function _parseBase()
|
||||
{
|
||||
// search for base tag; return empty string if none found
|
||||
preg_match_all(
|
||||
'/<base.*href=(\'|")(.*)\1.*>/imU',
|
||||
$this->_body,
|
||||
$bases
|
||||
);
|
||||
|
||||
// store url components (will need to be used before value returned)
|
||||
$components = parse_url($this->_url);
|
||||
$path = ($components['scheme']) . '://' . ($components['host']);
|
||||
if (isset($components['path'])) {
|
||||
$path .= $components['path'];
|
||||
} else {
|
||||
$path .= '/';
|
||||
}
|
||||
|
||||
// remove any filename that was specified by the path explicitely
|
||||
$path = preg_replace('/[^\/]*$/U', '', $path);
|
||||
|
||||
// no base tags found
|
||||
if (empty($bases[2]) === true) {
|
||||
return $path;
|
||||
}
|
||||
|
||||
// set base
|
||||
$base = trim(array_pop($bases[2]));
|
||||
|
||||
// set variable to check for target attribute value
|
||||
$found = array_pop($bases[0]);
|
||||
|
||||
// if a target attribute found
|
||||
if (preg_match('/target=/', $found)) {
|
||||
|
||||
// check if target being specified is the document itself (which
|
||||
// is okay)
|
||||
$self = preg_match('/target=[\'"]{1}_self[\'"]{1}/', $found);
|
||||
|
||||
// if it's not itself, set the base based on the url being
|
||||
// 'grabbed'
|
||||
if (!$self) {
|
||||
return $path;
|
||||
}
|
||||
}
|
||||
|
||||
// resolve path (check for trailing slash; always required)
|
||||
$resolved = $this->_resolveFullPath($base, $path);
|
||||
if (!preg_match('/\/$/', $resolved)) {
|
||||
return ($resolved) . '/';
|
||||
}
|
||||
return $resolved;
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseCharset
|
||||
*
|
||||
* @access private
|
||||
* @return String
|
||||
*/
|
||||
private function _parseCharset()
|
||||
{
|
||||
// get the page's charset (defined as a meta tag)
|
||||
preg_match(
|
||||
'#<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]*)#i',
|
||||
$this->_body,
|
||||
$charset
|
||||
);
|
||||
if (empty($charset) === true) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// return charset found
|
||||
$charset = array_pop($charset);
|
||||
$charset = strtolower($charset);
|
||||
$charset = trim($charset);
|
||||
if ($charset === 'utf8') {
|
||||
return 'utf-8';
|
||||
}
|
||||
return $charset;
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseDescription
|
||||
*
|
||||
* @notes not checking the index of the regular expression that
|
||||
* corresponds to the actual keywords in order to ensure that an
|
||||
* actual meta tag for keywords was specified. This way I can
|
||||
* return false if the meta tag isn't there at all
|
||||
* due to a bug, the second regex does *not* support newlines in
|
||||
* the meta tag content attribute values
|
||||
* @access private
|
||||
* @return string
|
||||
*/
|
||||
private function _parseDescription()
|
||||
{
|
||||
// grab meta tag; return immediately if false
|
||||
$description = $this->_parseMetaTag('description');
|
||||
if ($description === false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// trim/return
|
||||
return trim($description);
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseFavicon
|
||||
*
|
||||
* @access private
|
||||
* @return string
|
||||
*/
|
||||
private function _parseFavicon()
|
||||
{
|
||||
// generate default
|
||||
$parsed = parse_url($this->_url);
|
||||
$default = ($parsed['scheme']) . '://' . ($parsed['host']) .
|
||||
'/favicon.ico';
|
||||
|
||||
// get the page links (icon attribute value leading)
|
||||
preg_match_all(
|
||||
'/<link.+rel=(\'|").*[^-]\bicon\b.{0,20}href=(\'|")(.+)\2/imU',
|
||||
$this->_body,
|
||||
$favicons
|
||||
);
|
||||
if (empty($favicons[3]) === true) {
|
||||
|
||||
// get the page links (icon attribute value trailing)
|
||||
preg_match_all(
|
||||
'/<link.+href=(\'|")(.+)\1.{0,20}rel=(\'|").*[^-]\bicon\b/imU',
|
||||
$this->_body,
|
||||
$favicons
|
||||
);
|
||||
|
||||
// no favicon found
|
||||
if (empty($favicons[2]) === true) {
|
||||
return $default;
|
||||
}
|
||||
$favicon = array_pop($favicons[2]);
|
||||
} else {
|
||||
$favicon = array_pop($favicons[3]);
|
||||
}
|
||||
|
||||
// resolve full path
|
||||
$favicon = trim($favicon);
|
||||
$favicon = $this->_resolveFullPath($favicon, $this->getBase());
|
||||
$favicon = str_replace(PHP_EOL, '', $favicon);
|
||||
return $favicon;
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseImages
|
||||
*
|
||||
* @notes first expression capture relates to [^*]* rather than .* as
|
||||
* this was excluding new lines
|
||||
* @access private
|
||||
* @return array
|
||||
*/
|
||||
private function _parseImages()
|
||||
{
|
||||
// get the page images
|
||||
preg_match_all(
|
||||
'/<img[^*]*src=(\'|")(.+)\1/imU',
|
||||
$this->_body,
|
||||
$images
|
||||
);
|
||||
if (empty($images[2]) === true) {
|
||||
return array();
|
||||
}
|
||||
|
||||
// base the images
|
||||
$images = array_pop($images);
|
||||
$base = $this->getBase();
|
||||
foreach ($images as &$image) {
|
||||
$image = $this->_resolveFullPath($image, $base);
|
||||
$image = str_replace(PHP_EOL, '', $image);
|
||||
}
|
||||
$images = array_unique($images);
|
||||
$images = array_values($images);
|
||||
|
||||
// return images found
|
||||
return $images;
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseKeywords
|
||||
*
|
||||
* @notes not checking the index of the regular expression that
|
||||
* corresponds to the actual keywords in order to ensure that an
|
||||
* actual meta tag for keywords was specified. This way I can
|
||||
* return false if the meta tag isn't there at all
|
||||
* @access private
|
||||
* @return array
|
||||
*/
|
||||
private function _parseKeywords()
|
||||
{
|
||||
// grab meta tag; return immediately if false
|
||||
$keywords = $this->_parseMetaTag('keywords');
|
||||
if ($keywords === false) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// iterate over them, and set as array using comma as delimiter
|
||||
$keywords = explode(',', $keywords);
|
||||
foreach ($keywords as &$keyword) {
|
||||
$keyword = trim($keyword);
|
||||
}
|
||||
return $keywords;
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseMetaTag
|
||||
*
|
||||
* @access protected
|
||||
* @param string $value
|
||||
* @return false|string
|
||||
*/
|
||||
protected function _parseMetaTag($value, $attr = 'name')
|
||||
{
|
||||
// get the page meta-tag (name attribute leading)
|
||||
preg_match_all(
|
||||
'/<meta.+' . ($attr) . '=(\'|")(\bdc\.\b)?\b' .
|
||||
($value) . '\b\1.+content=(\'|")(.*)\3/imU',
|
||||
$this->_body,
|
||||
$tags
|
||||
);
|
||||
|
||||
// meta tag not found (not that it's empty, but not-found)
|
||||
if (empty($tags[3]) === true) {
|
||||
|
||||
// get the page meta-tag (name attribute trailing)
|
||||
preg_match_all(
|
||||
'/<meta.+content=(\'|")(.*)\1.+' . ($attr) .
|
||||
'=(\'|")(\bdc\.\b)?\b' . ($value) . '\b\3.+/imU',
|
||||
$this->_body,
|
||||
$tags
|
||||
);
|
||||
|
||||
// no meta-tag found
|
||||
if (empty($tags[3]) === true) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// return value found
|
||||
return array_pop($tags[2]);
|
||||
}
|
||||
|
||||
// return meta-tag found
|
||||
return array_pop($tags[4]);
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseOpenGraphKeys
|
||||
*
|
||||
* @access protected
|
||||
* @return array
|
||||
*/
|
||||
protected function _parseOpenGraphKeys()
|
||||
{
|
||||
preg_match_all('/([\'|"]{1})og:([a-zA-Z0-9\-:_]{1,25})\1/', $this->_body, $keys);
|
||||
return array_pop($keys);
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseSocialNetwork
|
||||
*
|
||||
* @access public
|
||||
* @param string $network
|
||||
* @return string|false
|
||||
*/
|
||||
public function _parseSocialNetwork($network)
|
||||
{
|
||||
$hosts = array(
|
||||
'facebook' => 'facebook.com',
|
||||
'twitter' => 'twitter.com',
|
||||
'instagram' => 'instagram.com',
|
||||
'pinterest' => 'pinterest.com'
|
||||
);
|
||||
$exemptions = array(
|
||||
'facebook' => array(),
|
||||
'twitter' => array(),
|
||||
'instagram' => array(),
|
||||
'pinterest' => array('pin')
|
||||
);
|
||||
$host = $hosts[$network];
|
||||
$host = str_replace('.', '\.', $host);
|
||||
$pattern = '/href=(\'|").+' . ($host) . '\/([^"\'\?\#\/]+)\1/U';
|
||||
preg_match($pattern, $this->_body, $matches);
|
||||
if (count($matches) > 0) {
|
||||
$id = array_pop($matches);
|
||||
if (in_array($id, $exemptions[$network]) === false) {
|
||||
return $id;
|
||||
}
|
||||
}
|
||||
$pattern = '/.+' . ($host) . '\/([^"\'\?\#\/]+)/';
|
||||
preg_match($pattern, $this->_body, $matches);
|
||||
if (count($matches) > 0) {
|
||||
$id = array_pop($matches);
|
||||
if (in_array($id, $exemptions[$network]) === false) {
|
||||
return $id;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseTitle
|
||||
*
|
||||
* @access private
|
||||
* @return string
|
||||
*/
|
||||
private function _parseTitle()
|
||||
{
|
||||
// get the page's title
|
||||
// preg_match('/<title[^>]*>([^<]+)<\/title>/i', $this->_body, $titles);
|
||||
preg_match('/<title[^>]*>([^<]+)<\/title>/im', $this->_body, $titles);
|
||||
if (empty($titles) === true) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// return title found
|
||||
return trim(array_pop($titles));
|
||||
}
|
||||
|
||||
/**
|
||||
* _parseYouTubeChannel
|
||||
*
|
||||
* @access public
|
||||
* @return string|false
|
||||
*/
|
||||
public function _parseYouTubeChannel()
|
||||
{
|
||||
$host = 'youtube.com';
|
||||
$host = str_replace('.', '\.', $host);
|
||||
$pattern = '/href="[^"]+' . ($host) . '\/channel[\\\]?\/([^"\?\#\/]+)/';
|
||||
preg_match($pattern, $this->_body, $matches);
|
||||
if (count($matches) > 0) {
|
||||
return array_pop($matches);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* getBase
|
||||
*
|
||||
* @notes do not need to check for false value after _parseFavicon as
|
||||
* default will always be returned (eg. domain.com/favicon.ico)
|
||||
* @access public
|
||||
* @return false|string
|
||||
*/
|
||||
public function getBase()
|
||||
{
|
||||
// return base found, if any
|
||||
if (isset($this->_parsed['base'])) {
|
||||
return $this->_parsed['base'];
|
||||
}
|
||||
|
||||
// parse base/return
|
||||
$base = $this->_parseBase();
|
||||
$this->_parsed['base'] = $base;
|
||||
return $base;
|
||||
}
|
||||
|
||||
/**
|
||||
* getCharset
|
||||
*
|
||||
* Returns the charset defined in the document, which may or may not be
|
||||
* the charset that is rendered by the browser. This is because
|
||||
* charsets passed from a server directive supercede those defined in
|
||||
* the document.
|
||||
*
|
||||
* @see <http://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html>
|
||||
* @access public
|
||||
* @return false|string
|
||||
*/
|
||||
public function getCharset()
|
||||
{
|
||||
// return charset found, if any
|
||||
if (isset($this->_parsed['charset'])) {
|
||||
return $this->_parsed['charset'];
|
||||
}
|
||||
|
||||
// parse charset/return
|
||||
$charset = $this->_parseCharset();
|
||||
if ($charset === false) {
|
||||
return false;
|
||||
}
|
||||
$this->_parsed['charset'] = $charset;
|
||||
return $charset;
|
||||
}
|
||||
|
||||
/**
|
||||
* getDescription
|
||||
*
|
||||
* @access public
|
||||
* @return false|string
|
||||
*/
|
||||
public function getDescription()
|
||||
{
|
||||
// return description found, if any
|
||||
if (isset($this->_parsed['description'])) {
|
||||
return $this->_parsed['description'];
|
||||
}
|
||||
|
||||
// parse description/return
|
||||
$description = $this->_parseDescription();
|
||||
if ($description === false) {
|
||||
return false;
|
||||
}
|
||||
$this->_parsed['description'] = $description;
|
||||
return $description;
|
||||
}
|
||||
|
||||
/**
|
||||
* getDetails
|
||||
*
|
||||
* @access public
|
||||
* @return array
|
||||
*/
|
||||
public function getDetails()
|
||||
{
|
||||
// return relevant meta data
|
||||
return array(
|
||||
'base' => $this->getBase(),
|
||||
'charset' => $this->getCharset(),
|
||||
'favicon' => $this->getFavicon(),
|
||||
'meta' => array(
|
||||
'description' => $this->getDescription(),
|
||||
'keywords' => $this->getKeywords()
|
||||
),
|
||||
'images' => $this->getImages(),
|
||||
'openGraph' => $this->getOpenGraph(),
|
||||
'social' => $this->getSocial(),
|
||||
'title' => $this->getTitle(),
|
||||
'url' => $this->getUrl()
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* getFavicon
|
||||
*
|
||||
* @notes do not need to check for false value after _parseFavicon as
|
||||
* default will always be returned (eg. domain.com/favicon.ico)
|
||||
* @access public
|
||||
* @return false|string
|
||||
*/
|
||||
public function getFavicon()
|
||||
{
|
||||
// return favicon found, if any
|
||||
if (isset($this->_parsed['favicon'])) {
|
||||
return $this->_parsed['favicon'];
|
||||
}
|
||||
|
||||
// parse favicon/return
|
||||
$favicon = $this->_parseFavicon();
|
||||
$this->_parsed['favicon'] = $favicon;
|
||||
return $favicon;
|
||||
}
|
||||
|
||||
/**
|
||||
* getImages
|
||||
*
|
||||
* @access public
|
||||
* @return false|array
|
||||
*/
|
||||
public function getImages()
|
||||
{
|
||||
// return images found, if any
|
||||
if (isset($this->_parsed['images'])) {
|
||||
return $this->_parsed['images'];
|
||||
}
|
||||
|
||||
// parse images/return
|
||||
$images = $this->_parseImages();
|
||||
$this->_parsed['images'] = $images;
|
||||
return $images;
|
||||
}
|
||||
|
||||
/**
|
||||
* getKeywords
|
||||
*
|
||||
* @access public
|
||||
* @return false|array
|
||||
*/
|
||||
public function getKeywords()
|
||||
{
|
||||
// return keywords found, if any
|
||||
if (isset($this->_parsed['keywords'])) {
|
||||
return $this->_parsed['keywords'];
|
||||
}
|
||||
|
||||
// parse keywords if found; encode; return
|
||||
$keywords = $this->_parseKeywords();
|
||||
if ($keywords === false) {
|
||||
return false;
|
||||
}
|
||||
$this->_parsed['keywords'] = $keywords;
|
||||
return $keywords;
|
||||
}
|
||||
|
||||
/**
|
||||
* getOpenGraph
|
||||
*
|
||||
* @access public
|
||||
* @return array
|
||||
*/
|
||||
public function getOpenGraph()
|
||||
{
|
||||
$graph = array();
|
||||
$keys = $this->_parseOpenGraphKeys();
|
||||
foreach ($keys as $key) {
|
||||
$graph[$key] = $this->_parseMetaTag('og:' . ($key), 'property');
|
||||
}
|
||||
|
||||
// resolve path to open graph image, if found
|
||||
if (in_array('image', $keys)) {
|
||||
$graph['imagePath'] = $this->_resolveFullPath(
|
||||
$graph['image'],
|
||||
$this->getBase()
|
||||
);
|
||||
}
|
||||
return $graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* getSocial
|
||||
*
|
||||
* @access public
|
||||
* @return array
|
||||
*/
|
||||
public function getSocial()
|
||||
{
|
||||
return array(
|
||||
'facebook' => $this->_parseSocialNetwork('facebook'),
|
||||
'twitter' => $this->_parseSocialNetwork('twitter'),
|
||||
'instagram' => $this->_parseSocialNetwork('instagram'),
|
||||
'pinterest' => $this->_parseSocialNetwork('pinterest'),
|
||||
'youTube' => $this->_parseYouTubeChannel()
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* getTitle
|
||||
*
|
||||
* @access public
|
||||
* @return false|string
|
||||
*/
|
||||
public function getTitle()
|
||||
{
|
||||
// return title found, if any
|
||||
if (isset($this->_parsed['title'])) {
|
||||
return $this->_parsed['title'];
|
||||
}
|
||||
|
||||
// parse title/return
|
||||
$title = $this->_parseTitle();
|
||||
if ($title === false) {
|
||||
return false;
|
||||
}
|
||||
$this->_parsed['title'] = $title;
|
||||
return $title;
|
||||
}
|
||||
|
||||
/**
|
||||
* getUrl
|
||||
*
|
||||
* @access public
|
||||
* @return string
|
||||
*/
|
||||
public function getUrl()
|
||||
{
|
||||
return $this->_url;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user