857 lines
33 KiB
PHP
857 lines
33 KiB
PHP
<?php
|
|
/**
|
|
* Meta data parsing functions for PDF documents
|
|
*
|
|
* @package Media Library Assistant
|
|
* @since 2.10
|
|
*/
|
|
|
|
/**
|
|
* Class MLA (Media Library Assistant) PDF extracts legacy and XMP meta data from PDF files
|
|
*
|
|
* @package Media Library Assistant
|
|
* @since 2.10
|
|
*/
|
|
class MLAPDF {
|
|
/**
|
|
* Array of PDF indirect objects
|
|
*
|
|
* This array contains all of the indirect object offsets and lengths.
|
|
* The array key is ( object ID * 1000 ) + object generation.
|
|
* The array value is array( number, generation, start, optional /length )
|
|
*
|
|
* @since 2.10
|
|
*
|
|
* @var array
|
|
*/
|
|
// NULL means "no table built yet": _build_pdf_indirect_objects() returns early unless this is NULL,
// and mla_extract_pdf_metadata() resets it to NULL before processing each file.
private static $pdf_indirect_objects = NULL;
|
|
|
|
/**
 * Record the in-use entries of one cross-reference table subsection
 *
 * The subsection is walked in 20-byte steps. Each entry supplies an offset, a generation
 * and a one-character flag. Entries flagged "n" are added to self::$pdf_indirect_objects
 * unless that object/generation key is already present; entries with any other flag only
 * advance the running object number.
 *
 * @since 2.10
 *
 * @param string buffer containing the subsection, passed by reference
 * @param integer offset within the buffer of the first entry
 * @param integer number of the first object in the subsection
 * @param integer number of entries in the subsection
 *
 * @return void
 */
private static function _parse_pdf_xref_subsection( &$xref_section, $offset, $object_id, $count ) {
	while ( $count-- ) {
		// Give up on the subsection as soon as no "offset generation flag" triple can be found
		if ( ! preg_match( '/(\d+) (\d+) (.)/', $xref_section, $entry, 0, $offset ) ) {
			break;
		}

		list( , $start, $generation, $flag ) = $entry;

		if ( 'n' == $flag ) {
			$key = ( $object_id * 1000 ) + $generation;

			// An entry that is already present is never overwritten
			if ( ! isset( self::$pdf_indirect_objects[ $key ] ) ) {
				self::$pdf_indirect_objects[ $key ] = array( 'number' => $object_id, 'generation' => (int) $generation, 'start' => (int) $start );
			}
		}

		$object_id++;
		$offset += 20;
	}
}
|
|
|
|
/**
 * Parse a cross-reference table section into the array of indirect object definitions
 *
 * Reads the section from the file, then processes one subsection at a time: a header of
 * "first-object-number entry-count" followed by that many 20-byte entries. The entries are
 * handed to _parse_pdf_xref_subsection(). Parsing stops at the first position that does not
 * look like a subsection header.
 *
 * @since 2.10
 *
 * @param string full path and file name
 * @param integer offset within the file of the xref id and count entry
 *
 * @return integer length of the section, or zero (0) if the file cannot be read
 */
private static function _parse_pdf_xref_section( $file_name, $file_offset ) {
	$chunksize = 16384;
	// Room kept after the current subsection so the next header is never cut off by the buffer end
	$header_slack = 64;

	$xref_max = $chunksize;
	$xref_section = file_get_contents( $file_name, true, NULL, $file_offset, $xref_max );
	if ( false === $xref_section ) {
		return 0;
	}

	$xref_length = 0;
	while ( preg_match( '/^[\x00-\x20]*(\d+) (\d+)[\x00-\x20]*/', substr( $xref_section, $xref_length ), $matches ) ) {
		// Integer casts keep a header such as "00" from acting as a truthy string counter
		$object_id = (int) $matches[1];
		$count = (int) $matches[2];
		$offset = $xref_length + strlen( $matches[0] );
		$xref_length = $offset + ( 20 * $count );

		/*
		 * Re-read with a buffer that covers the WHOLE subsection. Growing by a single
		 * chunk is not enough when one subsection is larger than a chunk.
		 */
		if ( ( $xref_max - $xref_length ) < $header_slack ) {
			$xref_max = $xref_length + $chunksize;
			$larger_section = file_get_contents( $file_name, true, NULL, $file_offset, $xref_max );
			if ( false === $larger_section ) {
				break;
			}

			$xref_section = $larger_section;
		}

		self::_parse_pdf_xref_subsection( $xref_section, $offset, $object_id, $count );
	} // while preg_match subsection header

	return $xref_length;
}
|
|
|
|
/**
 * Measure a cross-reference stream
 *
 * Locates the "stream" keyword at the given file offset and the matching "endstream"
 * keyword, reading more of the file as needed, and returns the number of bytes between
 * them. The stream content itself is NOT decoded, so no entries are added to the array
 * of indirect object definitions.
 *
 * @since 2.10
 *
 * @param string full path and file name
 * @param integer offset within the file of the "stream" keyword
 * @param string "/W" entry, representing the size of the fields in a single entry; currently unused
 *
 * @return integer length of the stream, or zero (0) if the stream delimiters are not found
 */
private static function _parse_pdf_xref_stream( $file_name, $file_offset, $entry_parms_string ) {
	$chunksize = 16384;
	$read_size = $chunksize;
	$xref_section = file_get_contents( $file_name, true, NULL, $file_offset, $read_size );

	if ( ( false === $xref_section ) || ( 'stream' != substr( $xref_section, 0, 6 ) ) ) {
		return 0;
	}

	// The keyword is followed by either LF (7 bytes in all) or CR LF (8 bytes in all)
	$tag_length = ( isset( $xref_section[6] ) && ( chr( 0x0D ) == $xref_section[6] ) ) ? 8 : 7;
	if ( strlen( $xref_section ) < $tag_length ) {
		return 0;
	}

	/*
	 * If necessary and possible, expand the $xref_section until it contains the end tag.
	 * A read shorter than requested means the end of the file has been reached.
	 */
	$end_tag = strpos( $xref_section, 'endstream', $tag_length );
	while ( ( false === $end_tag ) && ( $read_size == strlen( $xref_section ) ) ) {
		$read_size += $chunksize;
		$xref_section = file_get_contents( $file_name, true, NULL, $file_offset, $read_size );
		if ( false === $xref_section ) {
			return 0;
		}

		$end_tag = strpos( $xref_section, 'endstream', $tag_length );
	} // while not found

	if ( false === $end_tag ) {
		return 0;
	}

	return $end_tag - $tag_length;
}
|
|
|
|
/**
 * Build an array of indirect object definitions
 *
 * Scans the document text for "N G obj", "endobj", "stream" and "endstream" keywords and
 * records, for every object that is properly closed, its number, generation, start offset
 * (just after the "obj" keyword) and '/length' (bytes from start to the "endobj" keyword).
 * Keywords found between "stream" and "endstream" are ignored.
 * Does nothing when the array has already been built.
 *
 * @since 2.10
 *
 * @param string The entire PDF document, passed by reference
 *
 * @return void
 */
private static function _build_pdf_indirect_objects( &$string ) {
	if ( ! is_null( self::$pdf_indirect_objects ) ) {
		return;
	}

	$match_count = preg_match_all( '!(\d+)\\h+(\d+)\\h+obj|endobj|stream(\x0D\x0A|\x0A)|endstream!', $string, $matches, PREG_OFFSET_CAPTURE );
	self::$pdf_indirect_objects = array();

	// The object currently open, or NULL between objects
	$object_entry = NULL;
	$is_stream = false;
	for ( $index = 0; $index < $match_count; $index++ ) {
		$token = $matches[0][ $index ][0];
		$position = $matches[0][ $index ][1];
		$is_bad_value = false;

		if ( $is_stream ) {
			if ( 'endstream' == substr( $token, 0, 9 ) ) {
				$is_stream = false;
			}
		} elseif ( 'endobj' == substr( $token, 0, 6 ) ) {
			// Must be tested before the 'obj' suffix test below, which "endobj" also satisfies
			if ( is_array( $object_entry ) ) {
				$object_entry['/length'] = $position - $object_entry['start'];
				self::$pdf_indirect_objects[ ( $object_entry['number'] * 1000 ) + $object_entry['generation'] ] = $object_entry;
				$object_entry = NULL;
			} else {
				// "endobj" without an open object; there is nothing to close
				$is_bad_value = true;
			}
		} elseif ( 'obj' == substr( $token, -3 ) ) {
			$object_entry = array(
				'number' => (int) $matches[1][ $index ][0],
				'generation' => (int) $matches[2][ $index ][0],
				'start' => $position + strlen( $token )
			);
		} elseif ( 'stream' == substr( $token, 0, 6 ) ) {
			$is_stream = true;
		} else {
			$is_bad_value = true;
		}

		if ( $is_bad_value ) {
			/* translators: 1: ERROR tag 2: index */
			MLACore::mla_debug_add( sprintf( _x( '%1$s: _build_pdf_indirect_objects bad value at $index = "%2$d".', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $index ), MLACore::MLA_DEBUG_CATEGORY_ANY );
		}
	} // for each match
}
|
|
|
|
/**
 * Find the offset, length and contents of an indirect object containing a dictionary
 *
 * The function searches the file, from the object's cross-reference offset (or from the
 * beginning when the object is not in the cross-reference array) to the end, and collects
 * every "N G obj <<" header it finds, in file order. The last/most recent copy is returned
 * unless a specific instance is requested.
 * This is required because Adobe Acrobat does NOT increment the generation number when it reuses an object.
 *
 * @since 2.10
 *
 * @param string full path and file name
 * @param integer The object number
 * @param integer The object generation number; default zero (0)
 * @param integer The desired object instance (when multiple instances are present); default "highest/latest"
 *
 * @return mixed NULL on failure else array( 'count' => instances found, 'start' => offset in the file, 'length' => object length, 'content' => dictionary contents )
 */
private static function _find_pdf_indirect_dictionary( $file_name, $object, $generation = 0, $instance = NULL ) {
	$chunksize = 16384;
	// Consecutive chunks share this many bytes so a header split across a chunk boundary can still match
	$overlap = 16;

	$key = ( $object * 1000 ) + $generation;
	$file_offset = isset( self::$pdf_indirect_objects[ $key ] ) ? self::$pdf_indirect_objects[ $key ]['start'] : 0;

	/*
	 * Match the object header. The look-behind keeps "1 0 obj" from matching inside "11 0 obj".
	 */
	$pattern = sprintf( '!(?<![0-9])%1$d\\h+%2$d\\h+obj[\\x00-\\x20]*(<<)!', $object, $generation );

	/*
	 * Walk the file chunk by chunk, collecting every instance. Entries are keyed by absolute
	 * file position so an instance seen in two overlapping chunks is counted once.
	 */
	$object_starts = array();
	$prior_byte = '';
	do {
		$object_content = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );
		if ( false === $object_content ) {
			break;
		}

		if ( preg_match_all( $pattern, $object_content, $matches, PREG_OFFSET_CAPTURE ) ) {
			foreach ( $matches[0] as $match_index => $header ) {
				// The look-behind cannot see before the chunk; use the byte remembered from the previous chunk
				if ( ( 0 === $header[1] ) && ctype_digit( $prior_byte ) ) {
					continue;
				}

				$start = $matches[1][ $match_index ][1];
				$object_starts[ $file_offset + $start ] = array( 'offset' => $file_offset, 'start' => $start );
			}
		}

		// A short read means the end of the file has been reached
		$is_full_chunk = ( $chunksize == strlen( $object_content ) );
		if ( $is_full_chunk ) {
			$prior_byte = $object_content[ $chunksize - $overlap - 1 ];
			$file_offset += ( $chunksize - $overlap );
		}
	} while ( $is_full_chunk );

	$object_starts = array_values( $object_starts );

	/*
	 * Return the highest/latest instance unless a specific instance is requested
	 */
	$object_count = count( $object_starts );
	if ( is_null( $instance ) ) {
		$object_start = array_pop( $object_starts );
	} else {
		$instance = absint( $instance );
		$object_start = isset( $object_starts[ $instance ] ) ? $object_starts[ $instance ] : NULL;
	}

	if ( is_null( $object_start ) ) {
		return NULL;
	}

	/*
	 * Read from the opening "<<" and, if necessary and possible, expand the $object_content
	 * until it contains the end tag: ">>" followed by the "endobj" or "stream" keyword.
	 */
	$file_offset = $object_start['offset'] + $object_start['start'];
	$pattern = '!>>[\\x00-\\x20]*(?:endobj|stream)!';
	$read_size = 0;
	do {
		$read_size += $chunksize;
		$object_content = file_get_contents( $file_name, true, NULL, $file_offset, $read_size );
		if ( false === $object_content ) {
			return NULL;
		}

		$match_count = preg_match( $pattern, $object_content, $matches, PREG_OFFSET_CAPTURE );
	} while ( ( ! $match_count ) && ( $read_size == strlen( $object_content ) ) );

	if ( ! $match_count ) {
		return NULL;
	}

	// The length runs from the opening "<<" through the closing ">>"
	$length = $matches[0][1] + 2;
	return array( 'count' => $object_count, 'start' => $file_offset, 'length' => $length, 'content' => substr( $object_content, 0, $length ) );
}
|
|
|
|
/**
 * Parse a PDF Unicode (16-bit Big Endian) object
 *
 * The first two bytes (the byte order mark) are skipped. Values below 0x80 are copied as
 * single bytes, values 0x80 - 0xFF are translated through MLAData::$utf8_chars and higher
 * values, including surrogate pairs, are encoded as UTF-8. An unpaired surrogate becomes
 * a period (.) and a trailing odd byte is ignored.
 *
 * @since 2.10
 *
 * @param string PDF string of 16-bit characters, passed by reference
 *
 * @return string UTF-8 encoded string
 */
private static function _parse_pdf_UTF16BE( &$source_string ) {
	$output = '';
	$source_length = strlen( $source_string );

	// Stop while a complete two-byte unit is still available
	for ( $index = 2; ( $index + 1 ) < $source_length; $index += 2 ) {
		$value = ( ord( $source_string[ $index ] ) << 8 ) + ord( $source_string[ $index + 1 ] );

		// Combine a high surrogate with the low surrogate that follows it
		if ( ( 0xD800 <= $value ) && ( $value <= 0xDBFF ) && ( ( $index + 3 ) < $source_length ) ) {
			$low = ( ord( $source_string[ $index + 2 ] ) << 8 ) + ord( $source_string[ $index + 3 ] );
			if ( ( 0xDC00 <= $low ) && ( $low <= 0xDFFF ) ) {
				$value = 0x10000 + ( ( $value - 0xD800 ) << 10 ) + ( $low - 0xDC00 );
				$index += 2;
			}
		}

		if ( $value < 0x80 ) {
			$output .= chr( $value );
		} elseif ( $value < 0x100 ) {
			$output .= MLAData::$utf8_chars[ $value - 0x80 ];
		} elseif ( $value < 0x800 ) {
			$output .= chr( 0xC0 | ( $value >> 6 ) ) . chr( 0x80 | ( $value & 0x3F ) );
		} elseif ( ( 0xD800 <= $value ) && ( $value <= 0xDFFF ) ) {
			// Unpaired surrogate; there is no valid UTF-8 encoding
			$output .= '.';
		} elseif ( $value < 0x10000 ) {
			$output .= chr( 0xE0 | ( $value >> 12 ) ) . chr( 0x80 | ( ( $value >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $value & 0x3F ) );
		} else {
			$output .= chr( 0xF0 | ( $value >> 18 ) ) . chr( 0x80 | ( ( $value >> 12 ) & 0x3F ) ) . chr( 0x80 | ( ( $value >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $value & 0x3F ) );
		}
	}

	return $output;
}
|
|
|
|
/**
 * Parse a PDF string object
 *
 * Returns an array describing the string. The array has a '/length' element containing
 * the number of bytes occupied by the string in the source string, including the enclosing parentheses.
 * Balanced, unescaped parentheses are part of the value. Backslash escapes (n, r, t, b, f,
 * one to three octal digits, line continuation) are decoded; a backslash before any other
 * character is dropped and the character is kept.
 * If the closing parenthesis is missing, everything up to the end of the source is returned.
 *
 * @since 2.10
 *
 * @param string data within which the string occurs, passed by reference
 * @param integer offset within the source string of the opening '(' character.
 *
 * @return array ( 'type' => type, 'value' => value, '/length' => length ) for the string
 */
private static function _parse_pdf_string( &$source_string, $offset ) {
	$source_length = strlen( $source_string );
	if ( ( $offset < 0 ) || ( $offset >= $source_length ) || ( '(' != $source_string[ $offset ] ) ) {
		return array( 'type' => 'unknown', 'value' => '', '/length' => 0 );
	}

	/*
	 * Brute force, here we come...
	 * The loop is bounded by the source length so an unterminated string cannot spin forever.
	 */
	$output = '';
	$level = 0;
	$index = $offset + 1;
	while ( $index < $source_length ) {
		$byte = $source_string[ $index++ ];

		if ( '\\' != $byte ) {
			// just another 8-bit value, but check for balanced parentheses
			if ( '(' == $byte ) {
				$level++;
			} elseif ( ')' == $byte ) {
				if ( 0 == $level-- ) {
					// the closing delimiter; $index already points past it
					break;
				}
			}

			$output .= $byte;
			continue;
		}

		// REVERSE SOLIDUS; a backslash that is the very last byte has nothing to escape
		if ( $index >= $source_length ) {
			break;
		}

		$escaped = $source_string[ $index ];
		switch ( $escaped ) {
			case chr( 0x0A ):
				// line continuation; also swallow a following CR
				if ( isset( $source_string[ $index + 1 ] ) && ( chr( 0x0D ) == $source_string[ $index + 1 ] ) ) {
					$index++;
				}

				break;
			case chr( 0x0D ):
				// line continuation; also swallow a following LF
				if ( isset( $source_string[ $index + 1 ] ) && ( chr( 0x0A ) == $source_string[ $index + 1 ] ) ) {
					$index++;
				}

				break;
			case 'n':
				$output .= chr( 0x0A );
				break;
			case 'r':
				$output .= chr( 0x0D );
				break;
			case 't':
				$output .= chr( 0x09 );
				break;
			case 'b':
				$output .= chr( 0x08 );
				break;
			case 'f':
				$output .= chr( 0x0C );
				break;
			default: // could be a 1- to 3-digit octal value; only 0 - 7 are octal digits
				$digit_count = strspn( $source_string, '01234567', $index, 3 );

				if ( $digit_count ) {
					// high-order overflow is discarded, leaving a single byte
					$output .= chr( octdec( substr( $source_string, $index, $digit_count ) ) & 0xFF );
					$index += $digit_count - 1;
				} else { // accept the character following the backslash
					$output .= $escaped;
				}
		} // switch

		$index++;
	} // while inside the string

	return array( 'type' => 'string', 'value' => $output, '/length' => $index - $offset );
}
|
|
|
|
/**
 * Parse a PDF Linearization Parameter Dictionary object
 *
 * Looks for an object whose dictionary begins with /Linearized in the first 1,024 bytes of
 * the source and parses it with _parse_pdf_dictionary(). The result is an array of
 * dictionary contents, classified by object type, plus a '/length' element containing the
 * number of bytes occupied by the dictionary in the source string.
 *
 * @since 2.10
 *
 * @param string data within which the object occurs, typically the start of a PDF document
 * @param integer filesize of the PDF document, for validation purposes, or zero (0) to ignore filesize
 *
 * @return mixed array of dictionary objects on success, false on failure
 */
private static function _parse_pdf_LPD_dictionary( &$source_string, $filesize ) {
	$header = substr( $source_string, 0, 1024 );
	if ( ! preg_match( '!obj[\x00-\x20]*<<(/Linearized).*(>>)[\x00-\x20]*endobj!', $header, $matches, PREG_OFFSET_CAPTURE ) ) {
		return false;
	}

	// Parsing starts at the first content character, i.e. the "/Linearized" key
	$LPD = self::_parse_pdf_dictionary( $header, $matches[1][1] );

	// A zero '/length' is how _parse_pdf_dictionary() reports a missing end delimiter
	if ( empty( $LPD['/length'] ) ) {
		return false;
	}

	// NOTE(review): assumes the /L entry holds the total file length, per the PDF linearization rules - confirm
	if ( $filesize && isset( $LPD['L']['value'] ) && ( (int) $LPD['L']['value'] != (int) $filesize ) ) {
		return false;
	}

	return $LPD;
}
|
|
|
|
/**
 * Parse a PDF dictionary object
 *
 * Returns an array of dictionary contents, classified by object type: boolean, numeric, string, hex (string),
 * indirect (object), name, array, dictionary, stream, and null. Values that fit none of these are
 * typed 'unknown'.
 * The array also has a '/length' element containing the number of bytes from the given offset
 * through the closing '>>' delimiter. When no closing delimiter is found the problem is logged
 * and array( '/length' => 0 ) is returned.
 *
 * @since 2.10
 *
 * @param string data within which the string occurs, passed by reference
 * @param integer offset within the source string of the opening '<<' characters or the first content character.
 *
 * @return array ( '/length' => length, key => array( 'type' => type, 'value' => value ) ) for each dictionary field
 */
private static function _parse_pdf_dictionary( &$source_string, $offset ) {
	/*
	 * Find the end of the dictionary, stepping over nested dictionaries
	 */
	$nest = ( '<<' == substr( $source_string, $offset, 2 ) ) ? ( $offset + 2 ) : $offset;

	$level = 1;
	do {
		$dictionary_end = strpos( $source_string, '>>', $nest );
		if ( false === $dictionary_end ) {
			/* translators: 1: ERROR tag 2: source offset 3: nest level */
			MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary offset = %2$d, nest = %3$d.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $offset, $nest ), MLACore::MLA_DEBUG_CATEGORY_ANY );
			/* translators: 1: ERROR tag 2: dictionary excerpt */
			MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary no end delimiter dump = %2$s.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), MLAData::mla_hex_dump( substr( $source_string, $offset, 128 ), 128, 16 ) ), MLACore::MLA_DEBUG_CATEGORY_ANY );
			return array( '/length' => 0 );
		}

		// An opening delimiter before the next closing one starts a nested dictionary
		$next_open = strpos( $source_string, '<<', $nest );
		if ( ( false !== $next_open ) && ( $next_open < $dictionary_end ) ) {
			$nest = $next_open + 2;
			$level++;
		} else {
			$nest = $dictionary_end + 2;
			$level--;
		}
	} while ( $level );

	$dictionary_length = $dictionary_end + 2 - $offset;
	$dictionary = array();

	// \x00-\x20 for whitespace
	// \(|\)|\<|\>|\[|\]|\{|\}|\/|\% for delimiters
	$match_count = preg_match_all( '!/([^\x00-\x20|\(|\)|\<|\>|\[|\]|\{|\}|\/|\%]*)([\x00-\x20]*)!', substr( $source_string, $offset, $dictionary_length ), $matches, PREG_OFFSET_CAPTURE );

	// Source offset just past the most recent string, array or nested dictionary value
	$end_data = -1;
	for ( $match_index = 0; $match_index < $match_count; $match_index++ ) {
		$name = $matches[1][ $match_index ][0];
		$value_start = $offset + $matches[2][ $match_index ][1] + strlen( $matches[2][ $match_index ][0] );

		/*
		 * Skip over false matches within a string or nested dictionary
		 */
		if ( $value_start < $end_data ) {
			continue;
		}

		$end_data = -1;
		$value_count = preg_match(
			'!(\/?[^\/\x0D\x0A]*)!',
			substr( $source_string, $value_start, ( $dictionary_end - $value_start ) ), $value_matches, PREG_OFFSET_CAPTURE );

		if ( 1 != $value_count ) {
			$dictionary[ $name ] = array( 'value' => '', 'type' => 'nomatch' );
			continue;
		}

		$value = trim( $value_matches[0][0] );
		$dictionary[ $name ]['value'] = $value;
		if ( ! isset( $value[0] ) ) {
			/* translators: 1: ERROR tag 2: entry name 3: value excerpt */
			MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary bad value [ %2$s ] dump = %3$s', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $name, MLAData::mla_hex_dump( $value, 32, 16 ) ), MLACore::MLA_DEBUG_CATEGORY_ANY );
			continue;
		}

		if ( in_array( $value, array( 'true', 'false' ), true ) ) {
			$dictionary[ $name ]['type'] = 'boolean';
		} elseif ( is_numeric( $value ) ) {
			$dictionary[ $name ]['type'] = 'numeric';
		} elseif ( '(' == $value[0] ) {
			// The string parser supplies type and value; its '/length' tells us where the string ends
			$dictionary[ $name ] = self::_parse_pdf_string( $source_string, $value_start );
			$end_data = $value_start + $dictionary[ $name ]['/length'];
			unset( $dictionary[ $name ]['/length'] );
		} elseif ( '<' == $value[0] ) {
			if ( isset( $value[1] ) && ( '<' == $value[1] ) ) {
				$dictionary[ $name ]['value'] = self::_parse_pdf_dictionary( $source_string, $value_start );
				$dictionary[ $name ]['type'] = 'dictionary';
				$end_data = $value_start + 4 + $dictionary[ $name ]['value']['/length'];
				unset( $dictionary[ $name ]['value']['/length'] );
			} else {
				$dictionary[ $name ]['type'] = 'hex';
			}
		} elseif ( '/' == $value[0] ) {
			$dictionary[ $name ]['value'] = substr( $value, 1 );
			$dictionary[ $name ]['type'] = 'name';
			// The name value itself was also matched as a key; step over that match
			$match_index++; // Skip to the next key
		} elseif ( '[' == $value[0] ) {
			$dictionary[ $name ]['type'] = 'array';

			// An array with no closing bracket is cut off at the end of the dictionary
			$array_end = strpos( $source_string, ']', $value_start );
			if ( false === $array_end ) {
				$array_end = $dictionary_end;
			}

			$array_length = $array_end - ( $value_start + 1 );
			$dictionary[ $name ]['value'] = substr( $source_string, $value_start + 1, $array_length );
			$end_data = 2 + $value_start + $array_length;
		} elseif ( 'null' == $value ) {
			$dictionary[ $name ]['type'] = 'null';
		} elseif ( 'stream' == substr( $value, 0, 6 ) ) {
			$dictionary[ $name ]['type'] = 'stream';
		} else {
			$object_count = preg_match( '!(\d+)\h+(\d+)\h+R!', $value, $object_matches );

			if ( 1 == $object_count ) {
				$dictionary[ $name ]['type'] = 'indirect';
				$dictionary[ $name ]['object'] = $object_matches[1];
				$dictionary[ $name ]['generation'] = $object_matches[2];
			} else {
				$dictionary[ $name ]['type'] = 'unknown';
			}
		}
	} // for each match

	$dictionary['/length'] = $dictionary_length;
	return $dictionary;
}
|
|
|
|
/**
 * Extract dictionary from traditional cross-reference + trailer documents
 *
 * Handles both layouts found at a cross-reference offset: a traditional "xref" table followed
 * by a "trailer" dictionary, or a cross-reference stream object whose own dictionary takes the
 * place of the trailer. When the dictionary has a /Prev entry the function calls itself for
 * that offset and appends the older dictionaries after the current one.
 *
 * @since 2.10
 *
 * @param string full path to the desired file
 * @param integer offset within file of the cross-reference table
 * @param array offsets already processed in this /Prev chain; used internally to stop on circular references
 *
 * @return mixed array of "PDF dictionary arrays", newest first, or NULL on failure
 */
private static function _extract_pdf_trailer( $file_name, $file_offset, $visited_offsets = array() ) {
	$chunksize = 16384;
	$file_offset = (int) $file_offset;

	// A /Prev chain that loops back on itself would otherwise recurse without end
	if ( in_array( $file_offset, $visited_offsets, true ) ) {
		return NULL;
	}

	$visited_offsets[] = $file_offset;

	$tail = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );
	if ( false === $tail ) {
		return NULL;
	}

	$chunk_offset = 0;
	$dictionary = NULL;

	if ( 'xref' == substr( $tail, $chunk_offset, 4 ) ) {
		/*
		 * Traditional xref and trailer
		 */
		$xref_length = self::_parse_pdf_xref_section( $file_name, $file_offset + $chunk_offset + 4 );
		$chunk_offset += 4 + $xref_length;

		// Re-read from the end of the table when fewer than 1,024 bytes of the chunk remain for the trailer
		if ( $chunk_offset > ( $chunksize - 1024 ) ) {
			$file_offset += $chunk_offset;
			$tail = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );
			if ( false === $tail ) {
				return NULL;
			}

			$chunk_offset = 0;
		}

		if ( preg_match( '/[\x00-\x20]*trailer[\x00-\x20]+/', $tail, $matches, PREG_OFFSET_CAPTURE, $chunk_offset ) ) {
			$chunk_offset = $matches[0][1] + strlen( $matches[0][0] );
			$dictionary = self::_parse_pdf_dictionary( $tail, $chunk_offset );
		}
	} elseif ( preg_match( '!(\d+)\\h+(\d+)\\h+obj[\x00-\x20]*!', $tail, $matches, PREG_OFFSET_CAPTURE ) ) {
		/*
		 * Cross-reference stream object
		 */
		$chunk_offset = $matches[0][1] + strlen( $matches[0][0] );

		if ( '<<' == substr( $tail, $chunk_offset, 2 ) ) {
			$dictionary = self::_parse_pdf_dictionary( $tail, $chunk_offset );

			/*
			 * Parse the cross-reference stream following the dictionary, if present
			 */
			if ( isset( $dictionary['Type'] ) && ( 'XRef' == $dictionary['Type']['value'] ) ) {
				$entry_parms = isset( $dictionary['W']['value'] ) ? $dictionary['W']['value'] : '';
				self::_parse_pdf_xref_stream( $file_name, $file_offset + $chunk_offset + (int) $dictionary['/length'], $entry_parms );
			}
		}
	}

	if ( ! is_array( $dictionary ) ) {
		return NULL;
	}

	/*
	 * The current dictionary always comes first, so both layouts return "newest first"
	 */
	$trailers = array( $dictionary );
	if ( isset( $dictionary['Prev']['value'] ) ) {
		$older_trailers = self::_extract_pdf_trailer( $file_name, $dictionary['Prev']['value'], $visited_offsets );
		if ( is_array( $older_trailers ) ) {
			$trailers = array_merge( $trailers, $older_trailers );
		}
	}

	return $trailers;
}
|
|
|
|
/**
 * Extract Metadata from a PDF file
 *
 * Reads the PDF version from the file header, follows the last "startxref" pointer to the
 * trailer dictionaries, copies the entries of every Info dictionary instance into the 'pdf'
 * array and then looks for XMP metadata through the Root dictionary's Metadata reference.
 * XMP values, when found that way, are also merged into the 'pdf' array.
 * A missing or unreadable file, or a file without "startxref", yields whatever was collected
 * up to that point.
 *
 * @since 2.10
 *
 * @param string full path to the desired file
 *
 * @return array ( 'xmp' => array( key => value ), 'pdf' => array( key => value ) ) for each metadata field, in string format
 */
public static function mla_extract_pdf_metadata( $file_name ) {
	$xmp = array();
	$metadata = array();
	self::$pdf_indirect_objects = NULL;
	$chunksize = 16384;

	if ( ! file_exists( $file_name ) ) {
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	// Only the last chunk of the file is needed to find the startxref pointer
	$filesize = filesize( $file_name );
	$file_offset = ( $chunksize < $filesize ) ? ( $filesize - $chunksize ) : 0;
	$tail = file_get_contents( $file_name, false, NULL, $file_offset );
	if ( false === $tail ) {
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	if ( 0 == $file_offset ) {
		$header = substr( $tail, 0, 128 );
	} else {
		$header = file_get_contents( $file_name, false, NULL, 0, 128 );
	}

	if ( '%PDF-' == substr( $header, 0, 5 ) ) {
		$metadata['PDF_Version'] = substr( $header, 1, 7 );
		$metadata['PDF_VersionNumber'] = substr( $header, 5, 3 );
	}

	/*
	 * Find the xref and (optional) trailer
	 */
	$match_count = preg_match_all( '/startxref[\x00-\x20]+(\d+)[\x00-\x20]+\%\%EOF/', $tail, $matches, PREG_OFFSET_CAPTURE );
	if ( ! $match_count ) {
		/* translators: 1: ERROR tag 2: path and file */
		MLACore::mla_debug_add( sprintf( _x( '%1$s: File "%2$s", startxref not found.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $file_name ), MLACore::MLA_DEBUG_CATEGORY_ANY );
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	// The last startxref in the file is the current one
	$startxref = (int) $matches[1][ $match_count - 1 ][0];
	$trailer_dictionaries = self::_extract_pdf_trailer( $file_name, $startxref );
	if ( ! is_array( $trailer_dictionaries ) ) {
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	/*
	 * Use the Info reference from the first trailer dictionary that has one
	 */
	$info_reference = NULL;
	foreach ( $trailer_dictionaries as $trailer_dictionary ) {
		if ( isset( $trailer_dictionary['Info'] ) ) {
			$info_reference = $trailer_dictionary['Info'];
			break;
		}
	}

	// Only an indirect reference carries the object and generation numbers needed for the lookup
	if ( isset( $info_reference['object'] ) && isset( $info_reference['generation'] ) ) {
		$info_object = self::_find_pdf_indirect_dictionary( $file_name, $info_reference['object'], $info_reference['generation'] );

		/*
		 * Handle single or multiple Info instances
		 */
		$info_objects = array();
		if ( $info_object ) {
			if ( 1 == $info_object['count'] ) {
				$info_objects[] = $info_object;
			} else {
				for ( $index = 0; $index < $info_object['count']; $index++ ) {
					$info_objects[] = self::_find_pdf_indirect_dictionary( $file_name, $info_reference['object'], $info_reference['generation'], $index );
				}
			}
		}

		foreach ( $info_objects as $info_object ) {
			// An instance whose end could not be located comes back as NULL
			if ( ! isset( $info_object['content'] ) ) {
				continue;
			}

			$info_dictionary = self::_parse_pdf_dictionary( $info_object['content'], 0 );
			unset( $info_dictionary['/length'] );

			foreach ( $info_dictionary as $name => $value ) {
				if ( ! is_array( $value ) || ! array_key_exists( 'value', $value ) ) {
					continue;
				}

				// Entries with an empty value have no 'type' element
				if ( isset( $value['type'] ) && ( 'string' == $value['type'] ) ) {
					$prefix = substr( $value['value'], 0, 2 );
					if ( 'D:' == $prefix ) {
						$metadata[ $name ] = MLAData::mla_parse_pdf_date( $value['value'] );
					} elseif ( ( chr( 0xFE ) . chr( 0xFF ) ) == $prefix ) {
						// A leading byte order mark identifies 16-bit Big Endian text
						$metadata[ $name ] = self::_parse_pdf_UTF16BE( $value['value'] );
					} else {
						$metadata[ $name ] = $value['value'];
					}
				} else {
					$metadata[ $name ] = $value['value'];
				}
			} // each info entry
		} // foreach Info object

		/*
		 * Remove spurious "Filter" dictionaries
		 */
		unset( $metadata['Filter'] );
		unset( $metadata['Length'] );
		unset( $metadata['Length1'] );
	} // found Info reference

	/*
	 * Look for XMP Metadata
	 */
	$root_reference = NULL;
	foreach ( $trailer_dictionaries as $trailer_dictionary ) {
		if ( isset( $trailer_dictionary['Root'] ) ) {
			$root_reference = $trailer_dictionary['Root'];
			break;
		}
	}

	if ( isset( $root_reference['object'] ) && isset( $root_reference['generation'] ) ) {
		$root_object = self::_find_pdf_indirect_dictionary( $file_name, $root_reference['object'], $root_reference['generation'] );
		if ( $root_object ) {
			$root_dictionary = self::_parse_pdf_dictionary( $root_object['content'], 0 );
			unset( $root_dictionary['/length'] );

			if ( isset( $root_dictionary['Metadata'] ) ) {
				$xmp_object = NULL;
				if ( isset( $root_dictionary['Metadata']['object'] ) && isset( $root_dictionary['Metadata']['generation'] ) ) {
					$xmp_object = self::_find_pdf_indirect_dictionary( $file_name, $root_dictionary['Metadata']['object'], $root_dictionary['Metadata']['generation'] );
				}

				// The XMP packet is looked for after the Metadata dictionary, or from offset zero when that dictionary was not found
				$xmp_offset = is_array( $xmp_object ) ? ( $xmp_object['start'] + $xmp_object['length'] ) : 0;
				$xmp = MLAData::mla_parse_xmp_metadata( $file_name, $xmp_offset );

				if ( is_array( $xmp ) ) {
					$metadata = array_merge( $metadata, $xmp );
				} else {
					// Recovery attempt: search from the start of the file
					$xmp = MLAData::mla_parse_xmp_metadata( $file_name, 0 );
					if ( ! is_array( $xmp ) ) {
						$xmp = array();
					}
				}
			} // found Metadata reference
		} // found Root object
	} // found Root reference

	return array( 'xmp' => $xmp, 'pdf' => $metadata );
}
|
|
} // class MLAPDF
|
|
?>
|