277 lines
6.9 KiB
PHP
277 lines
6.9 KiB
PHP
<?php
|
|
/**
|
|
* Class for parsing CSV files
|
|
*
|
|
* @author Martins Pilsetnieks
|
|
*/
|
|
class SpreadsheetReader_CSV implements Iterator, Countable
|
|
{
|
|
/**
|
|
* @var array Options array, pre-populated with the default values.
|
|
*/
|
|
private $Options = array(
|
|
'Delimiter' => ';',
|
|
'Enclosure' => '"'
|
|
);
|
|
|
|
private $Encoding = 'UTF-8';
|
|
private $BOMLength = 0;
|
|
|
|
/**
|
|
* @var resource File handle
|
|
*/
|
|
private $Handle = false;
|
|
|
|
private $Filepath = '';
|
|
|
|
private $Index = 0;
|
|
|
|
private $CurrentRow = null;
|
|
|
|
/**
|
|
* @param string Path to file
|
|
* @param array Options:
|
|
* Enclosure => string CSV enclosure
|
|
* Separator => string CSV separator
|
|
*/
|
|
public function __construct($Filepath, array $Options = null)
|
|
{
|
|
$this -> Filepath = $Filepath;
|
|
|
|
if (!is_readable($Filepath))
|
|
{
|
|
throw new Exception('SpreadsheetReader_CSV: File not readable ('.$Filepath.')');
|
|
}
|
|
|
|
// For safety's sake
|
|
@ini_set('auto_detect_line_endings', true);
|
|
|
|
$this -> Options = array_merge($this -> Options, $Options);
|
|
$this -> Handle = fopen($Filepath, 'r');
|
|
|
|
// Checking the file for byte-order mark to determine encoding
|
|
$BOM16 = bin2hex(fread($this -> Handle, 2));
|
|
if ($BOM16 == 'fffe')
|
|
{
|
|
$this -> Encoding = 'UTF-16LE';
|
|
//$this -> Encoding = 'UTF-16';
|
|
$this -> BOMLength = 2;
|
|
}
|
|
elseif ($BOM16 == 'feff')
|
|
{
|
|
$this -> Encoding = 'UTF-16BE';
|
|
//$this -> Encoding = 'UTF-16';
|
|
$this -> BOMLength = 2;
|
|
}
|
|
|
|
if (!$this -> BOMLength)
|
|
{
|
|
fseek($this -> Handle, 0);
|
|
$BOM32 = bin2hex(fread($this -> Handle, 4));
|
|
if ($BOM32 == '0000feff')
|
|
{
|
|
//$this -> Encoding = 'UTF-32BE';
|
|
$this -> Encoding = 'UTF-32';
|
|
$this -> BOMLength = 4;
|
|
}
|
|
elseif ($BOM32 == 'fffe0000')
|
|
{
|
|
//$this -> Encoding = 'UTF-32LE';
|
|
$this -> Encoding = 'UTF-32';
|
|
$this -> BOMLength = 4;
|
|
}
|
|
}
|
|
|
|
fseek($this -> Handle, 0);
|
|
$BOM8 = bin2hex(fread($this -> Handle, 3));
|
|
if ($BOM8 == 'efbbbf')
|
|
{
|
|
$this -> Encoding = 'UTF-8';
|
|
$this -> BOMLength = 3;
|
|
}
|
|
|
|
// Seeking the place right after BOM as the start of the real content
|
|
if ($this -> BOMLength)
|
|
{
|
|
fseek($this -> Handle, $this -> BOMLength);
|
|
}
|
|
|
|
// Checking for the delimiter if it should be determined automatically
|
|
if (!$this -> Options['Delimiter'])
|
|
{
|
|
// fgetcsv needs single-byte separators
|
|
$Semicolon = ';';
|
|
$Tab = "\t";
|
|
$Comma = ',';
|
|
|
|
// Reading the first row and checking if a specific separator character
|
|
// has more columns than others (it means that most likely that is the delimiter).
|
|
$SemicolonCount = count(fgetcsv($this -> Handle, null, $Semicolon));
|
|
fseek($this -> Handle, $this -> BOMLength);
|
|
$TabCount = count(fgetcsv($this -> Handle, null, $Tab));
|
|
fseek($this -> Handle, $this -> BOMLength);
|
|
$CommaCount = count(fgetcsv($this -> Handle, null, $Comma));
|
|
fseek($this -> Handle, $this -> BOMLength);
|
|
|
|
$Delimiter = $Semicolon;
|
|
if ($TabCount > $SemicolonCount || $CommaCount > $SemicolonCount)
|
|
{
|
|
$Delimiter = $CommaCount > $TabCount ? $Comma : $Tab;
|
|
}
|
|
|
|
$this -> Options['Delimiter'] = $Delimiter;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns information about sheets in the file.
|
|
* Because CSV doesn't have any, it's just a single entry.
|
|
*
|
|
* @return array Sheet data
|
|
*/
|
|
public function Sheets()
|
|
{
|
|
return array(0 => basename($this -> Filepath));
|
|
}
|
|
|
|
/**
|
|
* Changes sheet to another. Because CSV doesn't have any sheets
|
|
* it just rewinds the file so the behaviour is compatible with other
|
|
* sheet readers. (If an invalid index is given, it doesn't do anything.)
|
|
*
|
|
* @param bool Status
|
|
*/
|
|
public function ChangeSheet($Index)
|
|
{
|
|
if ($Index == 0)
|
|
{
|
|
$this -> rewind();
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// !Iterator interface methods
|
|
/**
|
|
* Rewind the Iterator to the first element.
|
|
* Similar to the reset() function for arrays in PHP
|
|
*/
|
|
public function rewind()
|
|
{
|
|
fseek($this -> Handle, $this -> BOMLength);
|
|
$this -> CurrentRow = null;
|
|
$this -> Index = 0;
|
|
}
|
|
|
|
/**
|
|
* Return the current element.
|
|
* Similar to the current() function for arrays in PHP
|
|
*
|
|
* @return mixed current element from the collection
|
|
*/
|
|
public function current()
|
|
{
|
|
if ($this -> Index == 0 && is_null($this -> CurrentRow))
|
|
{
|
|
$this -> next();
|
|
$this -> Index--;
|
|
}
|
|
return $this -> CurrentRow;
|
|
}
|
|
|
|
/**
|
|
* Move forward to next element.
|
|
* Similar to the next() function for arrays in PHP
|
|
*/
|
|
public function next()
|
|
{
|
|
$this -> CurrentRow = array();
|
|
|
|
// Finding the place the next line starts for UTF-16 encoded files
|
|
// Line breaks could be 0x0D 0x00 0x0A 0x00 and PHP could split lines on the
|
|
// first or the second linebreak leaving unnecessary \0 characters that mess up
|
|
// the output.
|
|
if ($this -> Encoding == 'UTF-16LE' || $this -> Encoding == 'UTF-16BE')
|
|
{
|
|
while (!feof($this -> Handle))
|
|
{
|
|
// While bytes are insignificant whitespace, do nothing
|
|
$Char = ord(fgetc($this -> Handle));
|
|
if (!$Char || $Char == 10 || $Char == 13)
|
|
{
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
// When significant bytes are found, step back to the last place before them
|
|
if ($this -> Encoding == 'UTF-16LE')
|
|
{
|
|
fseek($this -> Handle, ftell($this -> Handle) - 1);
|
|
}
|
|
else
|
|
{
|
|
fseek($this -> Handle, ftell($this -> Handle) - 2);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
$this -> Index++;
|
|
$this -> CurrentRow = fgetcsv($this -> Handle, null, $this -> Options['Delimiter'], $this -> Options['Enclosure']);
|
|
|
|
if ($this -> CurrentRow)
|
|
{
|
|
// Converting multi-byte unicode strings
|
|
// and trimming enclosure symbols off of them because those aren't recognized
|
|
// in the relevan encodings.
|
|
if ($this -> Encoding != 'ASCII' && $this -> Encoding != 'UTF-8')
|
|
{
|
|
$Encoding = $this -> Encoding;
|
|
foreach ($this -> CurrentRow as $Key => $Value)
|
|
{
|
|
$this -> CurrentRow[$Key] = trim(trim(
|
|
mb_convert_encoding($Value, 'UTF-8', $this -> Encoding),
|
|
$this -> Options['Enclosure']
|
|
));
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
return $this -> CurrentRow;
|
|
}
|
|
|
|
/**
|
|
* Return the identifying key of the current element.
|
|
* Similar to the key() function for arrays in PHP
|
|
*
|
|
* @return mixed either an integer or a string
|
|
*/
|
|
public function key()
|
|
{
|
|
return $this -> Index;
|
|
}
|
|
|
|
/**
|
|
* Check if there is a current element after calls to rewind() or next().
|
|
* Used to check if we've iterated to the end of the collection
|
|
*
|
|
* @return boolean FALSE if there's nothing more to iterate over
|
|
*/
|
|
public function valid()
|
|
{
|
|
return ($this -> CurrentRow || !feof($this -> Handle));
|
|
}
|
|
|
|
// !Countable interface method
|
|
/**
|
|
* Ostensibly should return the count of the contained items but this just returns the number
|
|
* of rows read so far. It's not really correct but at least coherent.
|
|
*/
|
|
public function count()
|
|
{
|
|
return $this -> Index + 1;
|
|
}
|
|
}
|
|
?>
|