reasonableframework/vendor/PHP-MetaParser/MetaParser.class.php

<?php

    /**
     * MetaParser
     *
     * Parses content for meta and open graph details. Useful when used with a
     * curling library.
     *
     * @see     <https://github.com/onassar/PHP-Curler>
     * @author  Oliver Nassar <onassar@gmail.com>
     * @notes   Paths leading with '//' (aka. protocol-relative paths) are
     *          resolved using the scheme of the base url
     * @notes   The following urls provide good examples of the parsing engine:
     *          http://www.bbc.com/
     *          http://www.nytimes.com/
     *          http://techcrunch.com/
     *          http://metallo.scripps.edu/
     *          http://jobs.businessinsider.com/job/3b9d1c8e1e5e31d4e4fefb4a551b8b90/?d=1&source=site_home
     *          http://www.wikipedia.org
     *          http://yahoo.com
     *          http://twitter.com/wikileaks/status/8920530488926208#s
     *          http://veryawesomeworld.com/awesomebook/inside.html
     * @example
     * <code>
     *     // booting
     *     require_once APP . '/vendors/PHP-Curler/Curler.class.php';
     *     require_once APP . '/vendors/PHP-MetaParser/MetaParser.class.php';
     *
     *     // curling
     *     $curler = new Curler();
     *     $url = 'http://www.bbc.com/';
     *     $content = $curler->get($url);
     *     $parser = new MetaParser($content, $url);
     *     print_r($parser->getDetails());
     * </code>
     */
    class MetaParser
    {
        /**
         * _parsed
         *
         * Cache of the details that have already been parsed out of the body,
         * keyed by detail name (eg. 'base', 'title', 'favicon').
         *
         * @var    array
         * @access protected
         */
        protected $_parsed = array();

        /**
         * _body
         *
         * The raw markup that is searched by the parsing methods.
         *
         * @var    string
         * @access protected
         */
        protected $_body;

        /**
         * _url
         *
         * The url the body was retrieved from; used to resolve relative paths.
         *
         * @var    string
         * @access protected
         */
        protected $_url;

        /**
         * __construct
         *
         * Requires the content body and url to be provided. The url is useful
         * to generate relative paths for images, etc.
         *
         * @access public
         * @param  String $body
         * @param  String $url
         * @return void
         */
        public function __construct($body, $url)
        {
            $this->_body = $body;
            $this->_url = $url;
        }

        /**
         * _buildOrigin
         *
         * Builds the "scheme://host[:port]" prefix out of the result of a
         * parse_url call. Missing pieces (or a failed parse) are treated as
         * empty strings so that no notices are raised for malformed urls.
         *
         * @access private
         * @param  array|false $components value returned by parse_url
         * @return string
         */
        private function _buildOrigin($components)
        {
            $scheme = '';
            $host = '';
            $port = '';
            if (is_array($components) === true) {
                if (isset($components['scheme']) === true) {
                    $scheme = $components['scheme'];
                }
                if (isset($components['host']) === true) {
                    $host = $components['host'];
                }

                // an explicit port is part of the origin and must be kept
                if (isset($components['port']) === true) {
                    $port = ':' . ($components['port']);
                }
            }
            return ($scheme) . '://' . ($host) . ($port);
        }

        /**
         * _stripLineBreaks
         *
         * Removes carriage returns and line feeds from a value, regardless of
         * the platform the code runs on.
         *
         * @access private
         * @param  string $value
         * @return string
         */
        private function _stripLineBreaks($value)
        {
            return str_replace(array("\r", "\n"), '', $value);
        }

        /**
         * _resolveFullPath
         *
         * Turns an address found in the body into a full url, using the base
         * passed in. Addresses that carry their own scheme are returned
         * untouched. Only '../' sequences at the start of the address are
         * properly traversed.
         *
         * @see    http://ca3.php.net/manual/en/function.realpath.php#86384
         * @access protected
         * @param  string $addr
         * @param  string $base
         * @return string
         */
        protected function _resolveFullPath($addr, $base)
        {
            // nothing to resolve (a '0' address is a valid relative path)
            if ($addr === null || $addr === false || $addr === '') {
                return $base;
            }
            $addr = (string) $addr;

            // parse address; if scheme found, doesn't need to be resolved
            // (parse_url returns false for seriously malformed addresses)
            $parsed = parse_url($addr);
            if (is_array($parsed) === true && isset($parsed['scheme']) === true) {
                return $addr;
            }

            // parse base passed in (expected to be a full url)
            $parsed = parse_url($base);
            $origin = $this->_buildOrigin($parsed);

            // protocol specific (same scheme as the base)
            if (strncmp($addr, '//', 2) === 0) {
                $scheme = '';
                if (is_array($parsed) === true && isset($parsed['scheme']) === true) {
                    $scheme = $parsed['scheme'];
                }
                return ($scheme) . '://' . substr($addr, 2);
            }

            // otherwise if the address should go to the top of the tree
            if ($addr[0] === '/') {
                return ($origin) . ($addr);
            }

            // if the address doesn't contain any sub-directory calls
            if (strpos($addr, '../') === false) {
                return ($base) . ($addr);
            }

            // set-up sub-directory pieces for traversing/replacing; every
            // piece but the last one stands for one '../' jump
            $jumps = explode('../', $addr);
            $tail = array_pop($jumps);
            $levels = count($jumps);
            $basePath = '/';
            if (is_array($parsed) === true && isset($parsed['path']) === true) {
                $basePath = $parsed['path'];
            }
            $directories = explode('/', $basePath);
            array_pop($directories);

            // directories of the base that survive the traversal; only empty
            // segments are dropped (a directory named '0' must be kept)
            $kept = array_slice($directories, 0, 0 - $levels);
            $kept = array_filter($kept, 'strlen');

            // add non-empty remainder of the address as tail
            if ($tail !== '') {
                $kept[] = $tail;
            }

            // return sub-directory traversed address
            return ($origin) . '/' . implode('/', $kept);
        }

        /**
         * _parseBase
         *
         * Determines the url that relative paths are resolved against: the
         * href of the last base tag found in the body, or otherwise the
         * directory of the url the body was retrieved from.
         *
         * @todo   find url's that have various base values, and test them
         * @access private
         * @return string
         */
        private function _parseBase()
        {
            // directory of the document url (always ends with a slash)
            $components = parse_url($this->_url);
            $path = $this->_buildOrigin($components);
            if (is_array($components) === true && isset($components['path']) === true) {
                $path .= $components['path'];
            } else {
                $path .= '/';
            }

            // remove any filename that was specified by the path explicitly
            $path = preg_replace('/[^\/]*$/U', '', $path);

            // search for base tags; fall back to the document directory
            preg_match_all(
                '/<base.*href=(\'|")(.*)\1.*>/imU',
                $this->_body,
                $bases
            );
            if (empty($bases[2]) === true) {
                return $path;
            }

            // last base tag found wins
            $base = trim(array_pop($bases[2]));
            $found = array_pop($bases[0]);

            // a base tag with a target other than the document itself is
            // ignored; the url being 'grabbed' is used instead
            if (preg_match('/target=/', $found)) {
                $self = preg_match('/target=[\'"]{1}_self[\'"]{1}/', $found);
                if (!$self) {
                    return $path;
                }
            }

            // resolve path (check for trailing slash; always required)
            $resolved = $this->_resolveFullPath($base, $path);
            if (!preg_match('/\/$/', $resolved)) {
                return ($resolved) . '/';
            }
            return $resolved;
        }

        /**
         * _parseCharset
         *
         * Looks for a charset definition inside of a meta tag. The value is
         * lowercased, and 'utf8' is normalized to 'utf-8'.
         *
         * @access private
         * @return false|string
         */
        private function _parseCharset()
        {
            // get the page's charset (defined as a meta tag)
            $found = preg_match(
                '#<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]*)#i',
                $this->_body,
                $matches
            );
            if ($found !== 1) {
                return false;
            }

            // normalize and return charset found
            $charset = trim(strtolower($matches[1]));
            if ($charset === 'utf8') {
                return 'utf-8';
            }
            return $charset;
        }

        /**
         * _parseDescription
         *
         * @notes  false is returned when the meta tag isn't there at all, which
         *         differs from an empty description.
         *         The underlying expressions do *not* support newlines in the
         *         meta tag content attribute values
         * @access private
         * @return false|string
         */
        private function _parseDescription()
        {
            // grab meta tag; return immediately if false
            $description = $this->_parseMetaTag('description');
            if ($description === false) {
                return false;
            }

            // trim/return
            return trim($description);
        }

        /**
         * _parseFavicon
         *
         * Returns the href of the last icon link found in the body (resolved
         * to a full url), or the conventional /favicon.ico of the document's
         * origin when no such link is found.
         *
         * @access private
         * @return string
         */
        private function _parseFavicon()
        {
            // get the page links (icon attribute value leading)
            preg_match_all(
                '/<link.+rel=(\'|").*[^-]\bicon\b.{0,20}href=(\'|")(.+)\2/imU',
                $this->_body,
                $matches
            );
            if (empty($matches[3]) === false) {
                $favicon = array_pop($matches[3]);
            } else {

                // get the page links (icon attribute value trailing)
                preg_match_all(
                    '/<link.+href=(\'|")(.+)\1.{0,20}rel=(\'|").*[^-]\bicon\b/imU',
                    $this->_body,
                    $matches
                );

                // no favicon found; generate default
                if (empty($matches[2]) === true) {
                    $origin = $this->_buildOrigin(parse_url($this->_url));
                    return ($origin) . '/favicon.ico';
                }
                $favicon = array_pop($matches[2]);
            }

            // resolve full path
            $favicon = trim($favicon);
            $favicon = $this->_resolveFullPath($favicon, $this->getBase());
            return $this->_stripLineBreaks($favicon);
        }

        /**
         * _parseImages
         *
         * Returns the unique, fully resolved sources of the image tags found
         * in the body.
         *
         * @notes  the attributes leading up to src are matched with [^>]*
         *         rather than .* so that new lines are included, while the
         *         match can not run past the end of an image tag that has no
         *         src attribute (and pick up the src of a later tag)
         * @access private
         * @return array
         */
        private function _parseImages()
        {
            // get the page images
            preg_match_all(
                '/<img[^>]*src=(\'|")(.+)\1/imU',
                $this->_body,
                $matches
            );
            if (empty($matches[2]) === true) {
                return array();
            }

            // base the images
            $base = $this->getBase();
            $images = array();
            foreach ($matches[2] as $source) {
                $image = $this->_resolveFullPath(trim($source), $base);
                $images[] = $this->_stripLineBreaks($image);
            }

            // return images found (duplicates removed, keys reset)
            return array_values(array_unique($images));
        }

        /**
         * _parseKeywords
         *
         * @notes  false is returned when the meta tag isn't there at all, which
         *         differs from an empty list of keywords
         * @access private
         * @return false|array
         */
        private function _parseKeywords()
        {
            // grab meta tag; return immediately if false
            $keywords = $this->_parseMetaTag('keywords');
            if ($keywords === false) {
                return false;
            }

            // set as array using comma as delimiter, trimming each keyword
            return array_map('trim', explode(',', $keywords));
        }

        /**
         * _parseMetaTag
         *
         * Returns the content attribute of the last meta tag whose $attr
         * attribute equals $value (optionally prefixed with 'dc.'). The
         * attribute may either lead or trail the content attribute. Both
         * arguments are matched literally.
         *
         * @access protected
         * @param  string $value
         * @param  string $attr (default: 'name')
         * @return false|string
         */
        protected function _parseMetaTag($value, $attr = 'name')
        {
            // arguments are literals, not expressions
            $name = preg_quote($value, '/');
            $attribute = preg_quote($attr, '/');

            // get the page meta-tag (name attribute leading)
            preg_match_all(
                '/<meta.+' . ($attribute) . '=(\'|")(\bdc\.\b)?\b' .
                ($name) . '\b\1.+content=(\'|")(.*)\3/imU',
                $this->_body,
                $tags
            );

            // return meta-tag found
            if (empty($tags[3]) === false) {
                return array_pop($tags[4]);
            }

            // meta tag not found (not that it's empty, but not-found); get the
            // page meta-tag (name attribute trailing)
            preg_match_all(
                '/<meta.+content=(\'|")(.*)\1.+' . ($attribute) .
                '=(\'|")(\bdc\.\b)?\b' . ($name) . '\b\3.+/imU',
                $this->_body,
                $tags
            );

            // no meta-tag found
            if (empty($tags[3]) === true) {
                return false;
            }

            // return value found
            return array_pop($tags[2]);
        }

        /**
         * _parseOpenGraphKeys
         *
         * Returns the names (without the 'og:' prefix) of every quoted open
         * graph key found in the body, in order of appearance. Duplicates are
         * not removed.
         *
         * @access protected
         * @return array
         */
        protected function _parseOpenGraphKeys()
        {
            preg_match_all(
                '/([\'"])og:([a-zA-Z0-9\-:_]{1,25})\1/',
                $this->_body,
                $matches
            );
            if (isset($matches[2]) === false) {
                return array();
            }
            return $matches[2];
        }

        /**
         * _parseSocialNetwork
         *
         * Searches the body for an account name on the network passed in;
         * first within href attributes, then anywhere in the body.
         *
         * @access public
         * @param  string $network facebook, twitter, instagram or pinterest
         * @return string|false
         */
        public function _parseSocialNetwork($network)
        {
            $hosts = array(
                'facebook' => 'facebook.com',
                'twitter' => 'twitter.com',
                'instagram' => 'instagram.com',
                'pinterest' => 'pinterest.com'
            );

            // path segments that are not account names
            $exemptions = array(
                'facebook' => array(),
                'twitter' => array(),
                'instagram' => array(),
                'pinterest' => array('pin')
            );

            // unsupported network
            if (isset($hosts[$network]) === false) {
                return false;
            }
            $host = preg_quote($hosts[$network], '/');

            // quoted href first; then any mention of the host (whereby the
            // account name may not run into whitespace or markup)
            $patterns = array(
                '/href=(\'|").+' . ($host) . '\/([^"\'\?\#\/]+)\1/U',
                '/.+' . ($host) . '\/([^"\'\?\#\/\s<>]+)/'
            );
            foreach ($patterns as $pattern) {
                if (preg_match($pattern, $this->_body, $matches) !== 1) {
                    continue;
                }
                $id = array_pop($matches);
                if (in_array($id, $exemptions[$network], true) === false) {
                    return $id;
                }
            }
            return false;
        }

        /**
         * _parseTitle
         *
         * @access private
         * @return false|string
         */
        private function _parseTitle()
        {
            // get the page's title
            $found = preg_match(
                '/<title[^>]*>([^<]+)<\/title>/im',
                $this->_body,
                $titles
            );
            if ($found !== 1) {
                return false;
            }

            // return title found
            return trim($titles[1]);
        }

        /**
         * _parseYouTubeChannel
         *
         * Searches the double-quoted href attributes of the body for a
         * youtube.com channel link, and returns the channel found.
         *
         * @access public
         * @return string|false
         */
        public function _parseYouTubeChannel()
        {
            $host = preg_quote('youtube.com', '/');

            // the slash following 'channel' may be escaped by a backslash
            $pattern = '/href="[^"]+' . ($host) . '\/channel[\\\]?\/([^"\?\#\/]+)/';
            if (preg_match($pattern, $this->_body, $matches) === 1) {
                return array_pop($matches);
            }
            return false;
        }

        /**
         * getBase
         *
         * @notes  a base is always available, since the directory of the url
         *         is used when the body doesn't define a usable base tag
         * @access public
         * @return string
         */
        public function getBase()
        {
            // parse base, unless it has been parsed before
            if (array_key_exists('base', $this->_parsed) === false) {
                $this->_parsed['base'] = $this->_parseBase();
            }
            return $this->_parsed['base'];
        }

        /**
         * getCharset
         *
         * Returns the charset defined in the document, which may or may not be
         * the charset that is rendered by the browser. This is because
         * charsets passed from a server directive supersede those defined in
         * the document.
         *
         * @see    <http://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html>
         * @access public
         * @return false|string
         */
        public function getCharset()
        {
            // parse charset, unless it has been parsed before (a charset that
            // could not be found is remembered as well)
            if (array_key_exists('charset', $this->_parsed) === false) {
                $this->_parsed['charset'] = $this->_parseCharset();
            }
            return $this->_parsed['charset'];
        }

        /**
         * getDescription
         *
         * @access public
         * @return false|string
         */
        public function getDescription()
        {
            // parse description, unless it has been parsed before
            if (array_key_exists('description', $this->_parsed) === false) {
                $this->_parsed['description'] = $this->_parseDescription();
            }
            return $this->_parsed['description'];
        }

        /**
         * getDetails
         *
         * Returns every detail this class is able to parse, as one array.
         *
         * @access public
         * @return array
         */
        public function getDetails()
        {
            // return relevant meta data
            return array(
                'base' => $this->getBase(),
                'charset' => $this->getCharset(),
                'favicon' => $this->getFavicon(),
                'meta' => array(
                    'description' => $this->getDescription(),
                    'keywords' => $this->getKeywords()
                ),
                'images' => $this->getImages(),
                'openGraph' => $this->getOpenGraph(),
                'social' => $this->getSocial(),
                'title' => $this->getTitle(),
                'url' => $this->getUrl()
            );
        }

        /**
         * getFavicon
         *
         * @notes  do not need to check for false value after _parseFavicon as
         *         default will always be returned (eg. domain.com/favicon.ico)
         * @access public
         * @return string
         */
        public function getFavicon()
        {
            // parse favicon, unless it has been parsed before
            if (array_key_exists('favicon', $this->_parsed) === false) {
                $this->_parsed['favicon'] = $this->_parseFavicon();
            }
            return $this->_parsed['favicon'];
        }

        /**
         * getImages
         *
         * @access public
         * @return array
         */
        public function getImages()
        {
            // parse images, unless they have been parsed before
            if (array_key_exists('images', $this->_parsed) === false) {
                $this->_parsed['images'] = $this->_parseImages();
            }
            return $this->_parsed['images'];
        }

        /**
         * getKeywords
         *
         * @access public
         * @return false|array
         */
        public function getKeywords()
        {
            // parse keywords, unless they have been parsed before
            if (array_key_exists('keywords', $this->_parsed) === false) {
                $this->_parsed['keywords'] = $this->_parseKeywords();
            }
            return $this->_parsed['keywords'];
        }

        /**
         * getOpenGraph
         *
         * Returns the open graph properties found in the body, keyed by
         * property name (without the 'og:' prefix). A property whose meta tag
         * could not be found has the value false. When an image key was found,
         * 'imagePath' holds the fully resolved image url (or false when the
         * image itself is missing or empty).
         *
         * @access public
         * @return array
         */
        public function getOpenGraph()
        {
            // each key only needs to be looked up once
            $keys = array_values(array_unique($this->_parseOpenGraphKeys()));
            $graph = array();
            foreach ($keys as $key) {
                $graph[$key] = $this->_parseMetaTag('og:' . ($key), 'property');
            }

            // resolve path to open graph image, if found
            if (in_array('image', $keys, true) === true) {
                $graph['imagePath'] = false;
                $image = $graph['image'];
                if (is_string($image) === true && trim($image) !== '') {
                    $graph['imagePath'] = $this->_resolveFullPath(
                        trim($image),
                        $this->getBase()
                    );
                }
            }
            return $graph;
        }

        /**
         * getSocial
         *
         * Returns the social network accounts found in the body; false is set
         * for each network no account could be found for.
         *
         * @access public
         * @return array
         */
        public function getSocial()
        {
            return array(
                'facebook' => $this->_parseSocialNetwork('facebook'),
                'twitter' => $this->_parseSocialNetwork('twitter'),
                'instagram' => $this->_parseSocialNetwork('instagram'),
                'pinterest' => $this->_parseSocialNetwork('pinterest'),
                'youTube' => $this->_parseYouTubeChannel()
            );
        }

        /**
         * getTitle
         *
         * @access public
         * @return false|string
         */
        public function getTitle()
        {
            // parse title, unless it has been parsed before
            if (array_key_exists('title', $this->_parsed) === false) {
                $this->_parsed['title'] = $this->_parseTitle();
            }
            return $this->_parsed['title'];
        }

        /**
         * getUrl
         *
         * @access public
         * @return string
         */
        public function getUrl()
        {
            return $this->_url;
        }
    }