Create MetaParser.class.php

2018-03-21 11:06:44 +09:00 · 2018-03-21 11:06:44 +09:00 · 26992d0a26
commit 26992d0a26
parent 79456367fa
1 changed files with 720 additions and 0 deletions
--- a/vendor/PHP-MetaParser/MetaParser.class.php
+++ b/vendor/PHP-MetaParser/MetaParser.class.php
@ -0,0 +1,720 @@
+<?php
+
+    /**
+     * MetaParser
+     *
+     * Parses content for meta and open graph details. Useful when used with a
+     * curling library.
+     * 
+     * @see     <https://github.com/onassar/PHP-Curler>
+     * @author  Oliver Nassar <onassar@gmail.com>
+     * @todo    add support for paths leading with '//' (aka. use same scheme)
+     * @notes   The following urls provide good examples of the parsing engine:
+     *          http://www.bbc.com/
+     *          http://www.nytimes.com/
+     *          http://techcrunch.com/
+     *          http://metallo.scripps.edu/
+     *          http://jobs.businessinsider.com/job/3b9d1c8e1e5e31d4e4fefb4a551b8b90/?d=1&source=site_home
+     *          http://www.wikipedia.org
+     *          http://yahoo.com
+     *          http://twitter.com/wikileaks/status/8920530488926208#s
+     *          http://veryawesomeworld.com/awesomebook/inside.html
+     * @example
+     * <code>
+     *     // booting
+     *     require_once APP . '/vendors/PHP-Curler/Curler.class.php';
+     *     require_once APP . '/vendors/PHP-MetaParser/MetaParser.class.php';
+     *
+     *     // curling
+     *     $curler = new Curler();
+     *     $url = 'http://www.bbc.com/';
+     *     $content = $curler->get($url);
+     *     $parser = new MetaParser($content, $url);
+     *     print_r($parser->getDetails());
+     * </code>
+     */
+    class MetaParser
+    {
+        /**
+         * _parsed
+         * 
+         * @var    array
+         * @access protected
+         */
+        protected $_parsed;
+
+        /**
+         * _body
+         * 
+         * @var    string
+         * @access protected
+         */
+        protected $_body;
+
+        /**
+         * _url
+         * 
+         * @var    string
+         * @access protected
+         */
+        protected $_url;
+
+        /**
+         * __construct
+         * 
+         * Requires the content body and url to be provided. The url is useful
+         * to generate relative paths for images, etc.
+         * 
+         * @access public
+         * @param  String $body
+         * @param  String $url
+         * @return void
+         */
+        public function __construct($body, $url)
+        {
+            // address of the page; relative paths are resolved against it
+            $this->_url = $url;
+
+            // raw markup that the parsing methods search through
+            $this->_body = $body;
+        }
+
+        /**
+         * _resolveFullPath
+         * 
+         * @see    http://ca3.php.net/manual/en/function.realpath.php#86384
+         * @access protected
+         * @param  string $addr
+         * @param  string $base
+         * @return string
+         */
+        protected function _resolveFullPath($addr, $base)
+        {
+            // Turns $addr (absolute, protocol-relative, root-relative or
+            // directory-relative) into a full url, using $base (a full url
+            // ending in a slash) as the reference point.
+
+            // empty address provided; the base itself is the best answer
+            if (empty($addr) === true) {
+                return $base;
+            }
+
+            // an address carrying its own scheme is already absolute;
+            // parse_url returns false for seriously malformed urls, so the
+            // result is only inspected when it is an array
+            $parsed = parse_url($addr);
+            if (is_array($parsed) === true && array_key_exists('scheme', $parsed)) {
+                return $addr;
+            }
+
+            // parse base passed in (expected to be a full url)
+            $parsed = parse_url($base);
+
+            // protocol-relative address: borrow the scheme of the base
+            if (mb_substr($addr, 0, 2) === '//') {
+                return ($parsed['scheme']) . '://' . mb_substr($addr, 2);
+            }
+
+            // scheme://host[:port] of the base; the port has to be kept or
+            // links on non-default ports would point at the wrong server
+            $origin = ($parsed['scheme']) . '://' . ($parsed['host']);
+            if (isset($parsed['port']) === true) {
+                $origin .= ':' . ($parsed['port']);
+            }
+
+            // root-relative address: attach it directly to the origin (square
+            // brackets are required here; the curly-brace offset syntax is a
+            // parse error as of PHP 8)
+            if ($addr[0] === '/') {
+                return ($origin) . ($addr);
+            }
+
+            // no parent-directory hops: simply append to the base
+            if (strpos($addr, '../') === false) {
+                return ($base) . ($addr);
+            }
+
+            // set-up sub-directory pieces for traversing/replacing; a base
+            // without a path is treated as the root directory
+            $basePath = isset($parsed['path']) ? $parsed['path'] : '/';
+            $pieces = array(
+                'addr' => explode('../', $addr),
+                'base' => explode('/', $basePath)
+            );
+            array_pop($pieces['base']);
+            $count = count($pieces['addr']) - 1;
+
+            // directories of the base that survive the '../' hops; only
+            // empty segments are discarded, so a directory named "0" is kept
+            $replacements = array_slice($pieces['base'], 0, 0 - $count);
+            $replacements = array_filter($replacements, 'strlen');
+
+            // add whatever follows the last '../' as the tail
+            $tail = array_pop($pieces['addr']);
+            if ($tail !== '') {
+                $replacements[] = $tail;
+            }
+
+            // return sub-directory traversed address
+            return ($origin) . '/' . implode('/', $replacements);
+        }
+
+        /**
+         * _parseBase
+         * 
+         * @todo   find url's that have various base values, and test them
+         * @access private
+         * @return string
+         */
+        private function _parseBase()
+        {
+            // rebuild the requested location as scheme://host/path; a url
+            // without a path is treated as the site root
+            $parts = parse_url($this->_url);
+            $location = ($parts['scheme']) . '://' . ($parts['host']);
+            $location .= isset($parts['path']) ? $parts['path'] : '/';
+
+            // strip whatever follows the last slash (ie. an explicit filename)
+            $location = preg_replace('/[^\/]*$/U', '', $location);
+
+            // collect every base tag that carries a quoted href
+            preg_match_all(
+                '/<base.*href=(\'|")(.*)\1.*>/imU',
+                $this->_body,
+                $found
+            );
+
+            // no base tag: the directory of the url is the base
+            if (empty($found[2]) === true) {
+                return $location;
+            }
+
+            // the last matching tag in the document is the one that is used
+            $tag = array_pop($found[0]);
+            $href = trim(array_pop($found[2]));
+
+            // a tag whose target is anything other than _self is ignored,
+            // and the location of the url is used instead
+            $hasTarget = preg_match('/target=/', $tag);
+            if ($hasTarget && !preg_match('/target=[\'"]{1}_self[\'"]{1}/', $tag)) {
+                return $location;
+            }
+
+            // resolve the href; the returned base always ends with a slash
+            $resolved = $this->_resolveFullPath($href, $location);
+            if (preg_match('/\/$/', $resolved)) {
+                return $resolved;
+            }
+            return ($resolved) . '/';
+        }
+
+        /**
+         * _parseCharset
+         * 
+         * @access private
+         * @return String
+         */
+        private function _parseCharset()
+        {
+            // look for a charset declared through a meta tag
+            $pattern = '#<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]*)#i';
+            preg_match($pattern, $this->_body, $matches);
+
+            // nothing declared in the document
+            if (empty($matches) === true) {
+                return false;
+            }
+
+            // the last entry of the match is the captured value; normalize
+            // it to lower case, and the 'utf8' alias to 'utf-8'
+            $charset = trim(strtolower(array_pop($matches)));
+            return $charset === 'utf8' ? 'utf-8' : $charset;
+        }
+
+        /**
+         * _parseDescription
+         * 
+         * @notes  not checking the index of the regular expression that
+         *         corresponds to the actual keywords in order to ensure that an
+         *         actual meta tag for keywords was specified. This way I can
+         *         return false if the meta tag isn't there at all
+         *         due to a bug, the second regex does *not* support newlines in
+         *         the meta tag content attribute values
+         * @access private
+         * @return string
+         */
+        private function _parseDescription()
+        {
+            // look up the description meta tag; false means it is absent
+            $content = $this->_parseMetaTag('description');
+
+            // pass the not-found flag through, otherwise strip whitespace
+            return $content === false ? false : trim($content);
+        }
+
+        /**
+         * _parseFavicon
+         * 
+         * @access private
+         * @return string
+         */
+        private function _parseFavicon()
+        {
+            // fallback location, used when the markup declares no icon
+            $parts = parse_url($this->_url);
+            $fallback = ($parts['scheme']) . '://' . ($parts['host']) .
+                '/favicon.ico';
+
+            // first attempt: link tags whose rel attribute precedes the href
+            preg_match_all(
+                '/<link.+rel=(\'|").*[^-]\bicon\b.{0,20}href=(\'|")(.+)\2/imU',
+                $this->_body,
+                $links
+            );
+            if (empty($links[3]) === false) {
+                $href = array_pop($links[3]);
+            } else {
+
+                // second attempt: link tags whose href precedes the rel
+                preg_match_all(
+                    '/<link.+href=(\'|")(.+)\1.{0,20}rel=(\'|").*[^-]\bicon\b/imU',
+                    $this->_body,
+                    $links
+                );
+
+                // neither ordering matched
+                if (empty($links[2]) === true) {
+                    return $fallback;
+                }
+                $href = array_pop($links[2]);
+            }
+
+            // the last icon found wins; resolve it against the document base
+            $resolved = $this->_resolveFullPath(trim($href), $this->getBase());
+            return str_replace(PHP_EOL, '', $resolved);
+        }
+
+        /**
+         * _parseImages
+         * 
+         * @notes  first expression capture relates to [^*]* rather than .* as
+         *         this was excluding new lines
+         * @access private
+         * @return array
+         */
+        private function _parseImages()
+        {
+            // Returns the unique, fully resolved sources of the img tags found
+            // in the body (an empty array when there are none).
+
+            // capture the quoted src of every img tag; [^>]* (rather than .*)
+            // spans newlines while keeping the match inside the tag itself,
+            // so an img without a src cannot pick up the src of a later
+            // element (eg. a script or iframe)
+            preg_match_all(
+                '/<img[^>]*src=(\'|")(.+)\1/imU',
+                $this->_body,
+                $matches
+            );
+            if (empty($matches[2]) === true) {
+                return array();
+            }
+
+            // resolve each source against the document base
+            $base = $this->getBase();
+            $images = array();
+            foreach ($matches[2] as $source) {
+
+                // drop surrounding whitespace and any line break characters
+                // (both CR and LF, independent of the platform running this)
+                $source = str_replace(array("\r", "\n"), '', trim($source));
+                if ($source === '') {
+                    continue;
+                }
+                $images[] = $this->_resolveFullPath($source, $base);
+            }
+
+            // return images found, without duplicates and re-indexed
+            return array_values(array_unique($images));
+        }
+
+        /**
+         * _parseKeywords
+         * 
+         * @notes  not checking the index of the regular expression that
+         *         corresponds to the actual keywords in order to ensure that an
+         *         actual meta tag for keywords was specified. This way I can
+         *         return false if the meta tag isn't there at all
+         * @access private
+         * @return array
+         */
+        private function _parseKeywords()
+        {
+            // look up the keywords meta tag; false means it is absent
+            $content = $this->_parseMetaTag('keywords');
+            if ($content === false) {
+                return false;
+            }
+
+            // split on commas and strip the whitespace around each keyword
+            return array_map('trim', explode(',', $content));
+        }
+
+        /**
+         * _parseMetaTag
+         * 
+         * @access protected
+         * @param  string $value
+         * @return false|string
+         */
+        protected function _parseMetaTag($value, $attr = 'name')
+        {
+            // Returns the content attribute of the last meta tag whose $attr
+            // attribute equals $value (optionally prefixed with "dc."), or
+            // false when no such tag is found.
+
+            // both arguments end up inside a regular expression; escape them
+            // so that characters such as '/' or '.' are matched literally
+            // instead of breaking or loosening the pattern
+            $value = preg_quote($value, '/');
+            $attr = preg_quote($attr, '/');
+
+            // get the page meta-tag (name attribute leading)
+            preg_match_all(
+                '/<meta.+' . ($attr) . '=(\'|")(\bdc\.\b)?\b' .
+                ($value) . '\b\1.+content=(\'|")(.*)\3/imU',
+                $this->_body,
+                $tags
+            );
+
+            // the third group (the quote around the content) only has
+            // entries when a tag matched, even if its content is empty
+            if (empty($tags[3]) === false) {
+                return array_pop($tags[4]);
+            }
+
+            // get the page meta-tag (name attribute trailing)
+            preg_match_all(
+                '/<meta.+content=(\'|")(.*)\1.+' . ($attr) .
+                '=(\'|")(\bdc\.\b)?\b' . ($value) . '\b\3.+/imU',
+                $this->_body,
+                $tags
+            );
+
+            // no meta-tag found
+            if (empty($tags[3]) === true) {
+                return false;
+            }
+
+            // return value found (content is the second group here)
+            return array_pop($tags[2]);
+        }
+
+        /**
+         * _parseOpenGraphKeys
+         * 
+         * @access protected
+         * @return array
+         */
+        protected function _parseOpenGraphKeys()
+        {
+            // quoted og:* tokens anywhere in the body; the key is captured
+            // without its og: prefix
+            $pattern = '/([\'|"]{1})og:([a-zA-Z0-9\-:_]{1,25})\1/';
+            preg_match_all($pattern, $this->_body, $matches);
+
+            // the final capture group holds the key names
+            return array_pop($matches);
+        }
+
+        /**
+         * _parseSocialNetwork
+         * 
+         * @access public
+         * @param  string $network
+         * @return string|false
+         */
+        public function _parseSocialNetwork($network)
+        {
+            // Returns the first path segment that follows the network's host
+            // in the body (presumably the account name), or false when none
+            // is found or the network is not supported.
+            $hosts = array(
+                'facebook' => 'facebook.com',
+                'twitter' => 'twitter.com',
+                'instagram' => 'instagram.com',
+                'pinterest' => 'pinterest.com'
+            );
+
+            // path segments that are never accepted as a result
+            $exemptions = array(
+                'facebook' => array(),
+                'twitter' => array(),
+                'instagram' => array(),
+                'pinterest' => array('pin')
+            );
+
+            // unsupported network: nothing to search for
+            if (is_string($network) === false || isset($hosts[$network]) === false) {
+                return false;
+            }
+            $host = str_replace('.', '\.', $hosts[$network]);
+
+            // links inside an href attribute are tried first, then any
+            // occurrence of the host in the body
+            $patterns = array(
+                '/href=(\'|").+' . ($host) . '\/([^"\'\?\#\/]+)\1/U',
+                '/.+' . ($host) . '\/([^"\'\?\#\/]+)/'
+            );
+            foreach ($patterns as $pattern) {
+                if (preg_match($pattern, $this->_body, $matches) !== 1) {
+                    continue;
+                }
+
+                // the last capture group is the path segment
+                $id = array_pop($matches);
+                if (in_array($id, $exemptions[$network], true) === false) {
+                    return $id;
+                }
+            }
+            return false;
+        }
+
+        /**
+         * _parseTitle
+         * 
+         * @access private
+         * @return string
+         */
+        private function _parseTitle()
+        {
+            // capture the text between the title tags
+            $pattern = '/<title[^>]*>([^<]+)<\/title>/im';
+            preg_match($pattern, $this->_body, $matches);
+
+            // no (non-empty) title in the document
+            if (empty($matches) === true) {
+                return false;
+            }
+
+            // the last entry of the match is the captured text
+            $title = array_pop($matches);
+            return trim($title);
+        }
+
+        /**
+         * _parseYouTubeChannel
+         * 
+         * @access public
+         * @return string|false
+         */
+        public function _parseYouTubeChannel()
+        {
+            // escape the dot so the host is matched literally
+            $host = str_replace('.', '\.', 'youtube.com');
+
+            // double-quoted href pointing at a /channel/ path (the slash
+            // before the id may be preceded by a backslash)
+            $pattern = '/href="[^"]+' . ($host) . '\/channel[\\\]?\/([^"\?\#\/]+)/';
+            preg_match($pattern, $this->_body, $matches);
+
+            // the last entry of the match is the channel id
+            return count($matches) > 0 ? array_pop($matches) : false;
+        }
+
+        /**
+         * getBase
+         * 
+         * @notes  do not need to check for false value after _parseBase as a
+         *         path is always returned (eg. http://domain.com/)
+         * @access public
+         * @return false|string
+         */
+        public function getBase()
+        {
+            // parse the base on first use only; later calls reuse the result
+            if (isset($this->_parsed['base']) === false) {
+                $this->_parsed['base'] = $this->_parseBase();
+            }
+            return $this->_parsed['base'];
+        }
+
+        /**
+         * getCharset
+         * 
+         * Returns the charset defined in the document, which may or may not be
+         * the charset that is rendered by the browser. This is because
+         * charsets passed from a server directive supersede those defined in
+         * the document.
+         * 
+         * @see    <http://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html>
+         * @access public
+         * @return false|string
+         */
+        public function getCharset()
+        {
+            // parse on first use; a not-found result (false) is not stored
+            if (isset($this->_parsed['charset']) === false) {
+                $charset = $this->_parseCharset();
+                if ($charset === false) {
+                    return false;
+                }
+                $this->_parsed['charset'] = $charset;
+            }
+            return $this->_parsed['charset'];
+        }
+
+        /**
+         * getDescription
+         * 
+         * @access public
+         * @return false|string
+         */
+        public function getDescription()
+        {
+            // parse on first use; a not-found result (false) is not stored
+            if (isset($this->_parsed['description']) === false) {
+                $description = $this->_parseDescription();
+                if ($description === false) {
+                    return false;
+                }
+                $this->_parsed['description'] = $description;
+            }
+            return $this->_parsed['description'];
+        }
+
+        /**
+         * getDetails
+         * 
+         * @access public
+         * @return array
+         */
+        public function getDetails()
+        {
+            // collect the relevant meta data, one entry at a time
+            $details = array();
+            $details['base'] = $this->getBase();
+            $details['charset'] = $this->getCharset();
+            $details['favicon'] = $this->getFavicon();
+
+            // values read from named meta tags are grouped together
+            $details['meta'] = array(
+                'description' => $this->getDescription(),
+                'keywords' => $this->getKeywords()
+            );
+            $details['images'] = $this->getImages();
+            $details['openGraph'] = $this->getOpenGraph();
+            $details['social'] = $this->getSocial();
+            $details['title'] = $this->getTitle();
+            $details['url'] = $this->getUrl();
+            return $details;
+        }
+
+        /**
+         * getFavicon
+         * 
+         * @notes  do not need to check for false value after _parseFavicon as
+         *         default will always be returned (eg. domain.com/favicon.ico)
+         * @access public
+         * @return false|string
+         */
+        public function getFavicon()
+        {
+            // parse the favicon on first use only; later calls reuse it
+            if (isset($this->_parsed['favicon']) === false) {
+                $this->_parsed['favicon'] = $this->_parseFavicon();
+            }
+            return $this->_parsed['favicon'];
+        }
+
+        /**
+         * getImages
+         * 
+         * @access public
+         * @return false|array
+         */
+        public function getImages()
+        {
+            // parse the images on first use only; later calls reuse them
+            if (isset($this->_parsed['images']) === false) {
+                $this->_parsed['images'] = $this->_parseImages();
+            }
+            return $this->_parsed['images'];
+        }
+
+        /**
+         * getKeywords
+         * 
+         * @access public
+         * @return false|array
+         */
+        public function getKeywords()
+        {
+            // parse on first use; a not-found result (false) is not stored
+            if (isset($this->_parsed['keywords']) === false) {
+                $keywords = $this->_parseKeywords();
+                if ($keywords === false) {
+                    return false;
+                }
+                $this->_parsed['keywords'] = $keywords;
+            }
+            return $this->_parsed['keywords'];
+        }
+
+        /**
+         * getOpenGraph
+         * 
+         * @access public
+         * @return array
+         */
+        public function getOpenGraph()
+        {
+            // Returns the open graph values found in the body, keyed by their
+            // name without the og: prefix; a key whose meta tag cannot be
+            // parsed maps to false. When an image value is present, its
+            // resolved url is added under 'imagePath'.
+            $graph = array();
+            $keys = $this->_parseOpenGraphKeys();
+            if (is_array($keys) === false) {
+                return $graph;
+            }
+
+            // a key may be listed more than once; one lookup each is enough
+            foreach (array_unique($keys) as $key) {
+                $graph[$key] = $this->_parseMetaTag('og:' . ($key), 'property');
+            }
+
+            // resolve path to open graph image, but only when a usable value
+            // was parsed; resolving false or an empty string would report
+            // the page base as the image path
+            if (isset($graph['image']) === true && is_string($graph['image']) === true) {
+                $image = trim($graph['image']);
+                if ($image !== '') {
+                    $graph['imagePath'] = $this->_resolveFullPath(
+                        $image,
+                        $this->getBase()
+                    );
+                }
+            }
+            return $graph;
+        }
+
+        /**
+         * getSocial
+         * 
+         * @access public
+         * @return array
+         */
+        public function getSocial()
+        {
+            // networks that share the generic lookup, in output order
+            $networks = array('facebook', 'twitter', 'instagram', 'pinterest');
+            $social = array();
+            foreach ($networks as $network) {
+                $social[$network] = $this->_parseSocialNetwork($network);
+            }
+
+            // youtube channels have their own lookup
+            $social['youTube'] = $this->_parseYouTubeChannel();
+            return $social;
+        }
+
+        /**
+         * getTitle
+         * 
+         * @access public
+         * @return false|string
+         */
+        public function getTitle()
+        {
+            // parse on first use; a not-found result (false) is not stored
+            if (isset($this->_parsed['title']) === false) {
+                $title = $this->_parseTitle();
+                if ($title === false) {
+                    return false;
+                }
+                $this->_parsed['title'] = $title;
+            }
+            return $this->_parsed['title'];
+        }
+
+        /**
+         * getUrl
+         * 
+         * @access public
+         * @return string
+         */
+        public function getUrl()
+        {
+            // the url exactly as it was handed to the constructor
+            $url = $this->_url;
+            return $url;
+        }
+    }