Files
wordpress-preseed/wp-content/plugins/media-library-assistant/includes/class-mla-data-pdf.php
2019-10-24 00:12:05 +02:00

857 lines
33 KiB
PHP

<?php
/**
* Meta data parsing functions for PDF documents
*
* @package Media Library Assistant
* @since 2.10
*/
/**
* Class MLA (Media Library Assistant) PDF extracts legacy and XMP meta data from PDF files
*
* @package Media Library Assistant
* @since 2.10
*/
class MLAPDF {
/**
* Array of PDF indirect objects
*
* This array contains all of the indirect object offsets and lengths.
* The array key is ( object ID * 1000 ) + object generation.
* The array value is array( number, generation, start, optional /length )
*
* @since 2.10
*
* @var array
*/
private static $pdf_indirect_objects = NULL; // NULL means "not built yet" (tested in _build_pdf_indirect_objects); mla_extract_pdf_metadata() resets it to NULL for each file
/**
* Parse a cross-reference table subsection into the array of indirect object definitions
*
* A cross-reference subsection is a sequence of 20-byte entries, each with offset and generation values.
* @since 2.10
*
* @param string buffer containing the subsection
* @param integer offset within the buffer of the first entry
* @param integer number of the first object in the subsection
* @param integer number of entries in the subsection
*
* @return void
*/
private static function _parse_pdf_xref_subsection( &$xref_section, $offset, $object_id, $count ) {
	// One entry is "<offset> <generation> <flag>"; only entries flagged 'n' are recorded.
	$entry_pattern = '/(\d+) (\d+) (.)/';

	for ( $remaining = $count; $remaining; $remaining-- ) {
		if ( ! preg_match( $entry_pattern, $xref_section, $entry, 0, $offset ) ) {
			// Nothing parsable at this position; abandon the rest of the subsection.
			return;
		}

		if ( 'n' == $entry[3] ) {
			$index_key = ( $object_id * 1000 ) + $entry[2];

			// The first definition seen for a given object/generation pair wins.
			if ( ! isset( self::$pdf_indirect_objects[ $index_key ] ) ) {
				self::$pdf_indirect_objects[ $index_key ] = array(
					'number' => $object_id,
					'generation' => (integer) $entry[2],
					'start' => (integer) $entry[1],
				);
			}
		}

		// Entries are fixed-width, so step to the next one regardless of where the match landed.
		$object_id++;
		$offset += 20;
	}
}
/**
* Parse a cross-reference table section into the array of indirect object definitions
*
* Creates the array of indirect object offsets and lengths
* @since 2.10
*
* @param string full path and file name
* @param integer offset within the file of the xref id and count entry
*
* @return integer length of the section
*/
private static function _parse_pdf_xref_section( $file_name, $file_offset ) {
	// Purpose: walk every "<first object> <count>" subsection of a traditional
	// cross-reference table, handing each one to _parse_pdf_xref_subsection().
	// Returns the number of bytes occupied by the subsections, or 0 when the
	// file cannot be read or no subsection header is present.
	$xref_max = $chunksize = 16384;
	$xref_section = file_get_contents( $file_name, false, NULL, $file_offset, $chunksize );
	if ( ! is_string( $xref_section ) ) {
		return 0;
	}

	$xref_length = 0;
	while ( preg_match( '/^[\x00-\x20]*(\d+) (\d+)[\x00-\x20]*/', substr( $xref_section, $xref_length ), $matches, 0 ) ) {
		$object_id = (int) $matches[1];
		$count = (int) $matches[2];
		$offset = $xref_length + strlen( $matches[0] );
		$xref_length = $offset + ( 20 * $count );

		/*
		 * Grow the buffer until it covers the whole subsection. A single
		 * one-chunk extension is not enough for large tables, and entries
		 * beyond the buffer would be silently skipped. The loop stops as soon
		 * as a read comes back short, i.e., the end of the file was reached,
		 * so a bogus entry count cannot make it run away.
		 */
		while ( $xref_max < $xref_length && $xref_max == strlen( $xref_section ) ) {
			$xref_max *= 2;
			$reloaded = file_get_contents( $file_name, false, NULL, $file_offset, $xref_max );
			if ( ! is_string( $reloaded ) ) {
				break;
			}

			$xref_section = $reloaded;
		}

		self::_parse_pdf_xref_subsection( $xref_section, $offset, $object_id, $count );
	} // while preg_match subsection header

	return $xref_length;
}
/**
* Parse a cross-reference stream into the array of indirect object definitions
*
* Creates the array of indirect object offsets and lengths
* @since 2.10
*
* @param string full path and file name
* @param integer offset within the file of the xref id and count entry
* @param string "/W" entry, representing the size of the fields in a single entry
*
* @return integer length of the stream
*/
private static function _parse_pdf_xref_stream( $file_name, $file_offset, $entry_parms_string ) {
	// Purpose: measure the data of the stream that starts at $file_offset.
	// Returns the number of bytes between the "stream" line and "endstream",
	// or 0 when no complete stream is found there.
	//
	// $entry_parms_string is accepted for interface compatibility only. The
	// stream entries are not decoded, so nothing is added to the indirect
	// object index. (The decoding code that used to follow the return
	// statement could never execute and relied on an undefined variable.)
	$chunksize = 16384;
	$buffer_size = $chunksize;
	$xref_section = file_get_contents( $file_name, false, NULL, $file_offset, $buffer_size );
	if ( ! is_string( $xref_section ) || 'stream' != substr( $xref_section, 0, 6 ) ) {
		return 0;
	}

	// The keyword is followed by LF (7 bytes in all) or CR LF (8 bytes in all)
	$tag_length = 7;
	if ( isset( $xref_section[6] ) && chr(0x0D) == $xref_section[6] ) {
		$tag_length++;
	}

	// A buffer that ends inside the "stream" line cannot contain any data
	if ( strlen( $xref_section ) < $tag_length ) {
		return 0;
	}

	/*
	 * If necessary and possible, expand the $xref_section until it contains the end tag.
	 * A read shorter than requested means the end of the file was reached.
	 */
	$end_tag = strpos( $xref_section, 'endstream', $tag_length );
	while ( false === $end_tag && $buffer_size == strlen( $xref_section ) ) {
		$buffer_size += $chunksize;
		$xref_section = file_get_contents( $file_name, false, NULL, $file_offset, $buffer_size );
		if ( ! is_string( $xref_section ) ) {
			return 0;
		}

		$end_tag = strpos( $xref_section, 'endstream', $tag_length );
	} // while not found

	if ( false === $end_tag ) {
		return 0;
	}

	return $end_tag - $tag_length;
}
/**
* Build an array of indirect object definitions
*
* Creates the array of indirect object offsets and lengths
* @since 2.10
*
* @param string The entire PDF document, passed by reference
*
* @return void
*/
private static function _build_pdf_indirect_objects( &$string ) {
	// Purpose: scan the whole document for "N G obj ... endobj" pairs and
	// record each object's start offset and length in self::$pdf_indirect_objects.
	// Does nothing when the index has already been built.
	if ( ! is_null( self::$pdf_indirect_objects ) ) {
		return;
	}

	$match_count = preg_match_all( '!(\d+)\\h+(\d+)\\h+obj|endobj|stream(\x0D\x0A|\x0A)|endstream!', $string, $matches, PREG_OFFSET_CAPTURE );
	self::$pdf_indirect_objects = array();

	// The object whose "obj" keyword has been seen but whose "endobj" has not
	$object_entry = NULL;
	$is_stream = false;

	for ( $index = 0; $index < $match_count; $index++ ) {
		$token = $matches[0][ $index ][0];
		$token_offset = $matches[0][ $index ][1];

		if ( $is_stream ) {
			// Stream data is opaque; ignore everything up to the end of the stream
			if ( 'endstream' == substr( $token, 0, 9 ) ) {
				$is_stream = false;
			}
		} elseif ( 'endobj' == substr( $token, 0, 6 ) ) {
			// An "endobj" with no open object (stray or repeated keyword) has
			// nothing to close; skipping it avoids registering a bogus entry.
			if ( is_null( $object_entry ) ) {
				continue;
			}

			$object_entry['/length'] = $token_offset - $object_entry['start'];
			self::$pdf_indirect_objects[ ( $object_entry['number'] * 1000 ) + $object_entry['generation'] ] = $object_entry;
			$object_entry = NULL;
		} elseif ( 'obj' == substr( $token, -3 ) ) {
			// Integers, to agree with the entries created from cross-reference tables
			$object_entry = array(
				'number' => (int) $matches[1][ $index ][0],
				'generation' => (int) $matches[2][ $index ][0],
				'start' => $token_offset + strlen( $token )
			);
		} elseif ( 'stream' == substr( $token, 0, 6 ) ) {
			$is_stream = true;
		} else {
			/* translators: 1: ERROR tag 2: index */
			MLACore::mla_debug_add( sprintf( _x( '%1$s: _build_pdf_indirect_objects bad value at $index = "%2$d".', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $index ), MLACore::MLA_DEBUG_CATEGORY_ANY );
		}
	} // for each match
}
/**
* Find the offset, length and contents of an indirect object containing a dictionary
*
* The function searches the entire file, if necessary, to find the last/most recent copy of the object.
* This is required because Adobe Acrobat does NOT increment the generation number when it reuses an object.
*
* @since 2.10
*
* @param string full path and file name
* @param integer The object number
* @param integer The object generation number; default zero (0)
* @param integer The desired object instance (when multiple instances are present); default "highest/latest"
*
* @return mixed NULL on failure else array( 'start' => offset in the file, 'length' => object length, 'content' => dictionary contents )
*/
private static function _find_pdf_indirect_dictionary( $file_name, $object, $generation = 0, $instance = NULL ) {
	// Purpose: locate "<object> <generation> obj <<" in the file and return
	// the dictionary text that follows it.
	//
	// The scan starts at the offset recorded in self::$pdf_indirect_objects
	// (or at the start of the file) and continues to the end of the file.
	// At most one candidate is recorded per chunk read. $instance selects a
	// candidate by zero-based position; NULL selects the last one.
	//
	// Returns NULL on failure, else array( 'count', 'start', 'length', 'content' ).
	$chunksize = 16384;
	$object = (int) $object;
	$generation = (int) $generation;
	$key = ( $object * 1000 ) + $generation;

	if ( isset( self::$pdf_indirect_objects[ $key ] ) ) {
		$file_offset = self::$pdf_indirect_objects[ $key ]['start'];
	} else {
		$file_offset = 0;
	}

	/*
	 * Match the object header. The lookbehind stops a search for object 1
	 * from matching inside "11 0 obj" or "21 0 obj".
	 */
	$pattern = sprintf( '!(?<![0-9])%1$d\\h+%2$d\\h+obj[\\x00-\\x20]*(<<)!', $object, $generation );

	/*
	 * Advance through the file in chunks that overlap by 16 bytes, so a
	 * header that straddles a chunk boundary is still seen.
	 */
	$object_starts = array();
	$last_position = -1;
	do {
		$object_content = file_get_contents( $file_name, false, NULL, $file_offset, $chunksize );
		if ( ! is_string( $object_content ) ) {
			break;
		}

		if ( preg_match( $pattern, $object_content, $matches, PREG_OFFSET_CAPTURE ) ) {
			// A header lying inside the overlap is visible in two consecutive chunks; count it once
			$position = $file_offset + $matches[1][1];
			if ( $position !== $last_position ) {
				$object_starts[] = array( 'offset' => $file_offset, 'start' => $matches[1][1] );
				$last_position = $position;
			}
		}

		// A short read means the end of the file was reached
		$more_to_read = ( $chunksize == strlen( $object_content ) );
		$file_offset += ( $chunksize - 16 );
	} while ( $more_to_read );

	/*
	 * Return the highest/latest instance unless a specific instance is requested
	 */
	$object_count = count( $object_starts );
	if ( is_null( $instance ) ) {
		$object_start = array_pop( $object_starts );
	} else {
		$instance = absint( $instance );
		$object_start = isset( $object_starts[ $instance ] ) ? $object_starts[ $instance ] : NULL;
	}

	if ( is_null( $object_start ) ) {
		return NULL;
	}

	$file_offset = $object_start['offset'];
	$start = $object_start['start'];
	$object_content = file_get_contents( $file_name, false, NULL, $file_offset, $chunksize );
	if ( ! is_string( $object_content ) ) {
		return NULL;
	}

	/*
	 * The dictionary ends at the first ">>" followed by "endobj" or "stream".
	 * This must be an alternation: a bracketed character class would accept
	 * any single letter of those keywords.
	 */
	$pattern = '!>>[\\x00-\\x20]*(?:endobj|stream)!';
	$match_count = preg_match( $pattern, $object_content, $matches, PREG_OFFSET_CAPTURE, $start );

	/*
	 * If necessary and possible, expand the $object_content until it contains the end tag
	 */
	if ( ! $match_count && ( $chunksize == strlen( $object_content ) ) ) {
		// Re-base the buffer on the start of the dictionary before growing it
		$file_offset = $file_offset + $start;
		$start = 0;
		$new_chunksize = $chunksize;

		do {
			$new_chunksize += $chunksize;
			$object_content = file_get_contents( $file_name, false, NULL, $file_offset, $new_chunksize );
			if ( ! is_string( $object_content ) ) {
				return NULL;
			}

			$match_count = preg_match( $pattern, $object_content, $matches, PREG_OFFSET_CAPTURE, $start );
		} while ( ! $match_count && ( $new_chunksize == strlen( $object_content ) ) );
	} // if not found

	if ( ! $match_count ) {
		return NULL;
	}

	// The length runs from the opening "<<" through the closing ">>"
	$length = ( $matches[0][1] + 2 ) - $start;

	return array(
		'count' => $object_count,
		'start' => $file_offset + $start,
		'length' => $length,
		'content' => substr( $object_content, $start, $length ),
	);
}
/**
* Parse a PDF Unicode (16-bit Big Endian) object
*
* @since 2.10
*
* @param string PDF string of 16-bit characters
*
* @return string UTF-8 encoded string
*/
private static function _parse_pdf_UTF16BE( &$source_string ) {
	// Purpose: convert a UTF-16BE PDF text string to UTF-8.
	// The first two bytes are skipped; the caller only passes strings that
	// begin with the 0xFE 0xFF byte order mark. A dangling odd byte at the
	// end of the input is ignored, and an unpaired surrogate becomes '.'.
	$output = '';
	$length = strlen( $source_string );

	for ( $index = 2; ( $index + 1 ) < $length; $index += 2 ) {
		$value = ( ord( $source_string[ $index ] ) << 8 ) + ord( $source_string[ $index + 1 ] );

		if ( 0xD800 <= $value && $value <= 0xDBFF ) {
			// High surrogate: combine it with the low surrogate that must follow
			$low = 0;
			if ( ( $index + 3 ) < $length ) {
				$low = ( ord( $source_string[ $index + 2 ] ) << 8 ) + ord( $source_string[ $index + 3 ] );
			}

			if ( $low < 0xDC00 || 0xDFFF < $low ) {
				$output .= '.';
				continue;
			}

			$value = 0x10000 + ( ( $value - 0xD800 ) << 10 ) + ( $low - 0xDC00 );
			$index += 2;
		} elseif ( 0xDC00 <= $value && $value <= 0xDFFF ) {
			// Low surrogate without a preceding high surrogate
			$output .= '.';
			continue;
		}

		if ( $value < 0x80 ) {
			$output .= chr( $value );
		} elseif ( $value < 0x100 ) {
			// Unchanged: this range is still served by the shared lookup table
			$output .= MLAData::$utf8_chars[ $value - 0x80 ];
		} elseif ( $value < 0x800 ) {
			$output .= chr( 0xC0 | ( $value >> 6 ) ) . chr( 0x80 | ( $value & 0x3F ) );
		} elseif ( $value < 0x10000 ) {
			$output .= chr( 0xE0 | ( $value >> 12 ) ) . chr( 0x80 | ( ( $value >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $value & 0x3F ) );
		} else {
			$output .= chr( 0xF0 | ( $value >> 18 ) ) . chr( 0x80 | ( ( $value >> 12 ) & 0x3F ) ) . chr( 0x80 | ( ( $value >> 6 ) & 0x3F ) ) . chr( 0x80 | ( $value & 0x3F ) );
		}
	}

	return $output;
}
/**
* Parse a PDF string object
*
* Returns an array with one dictionary entry. The array also has a '/length' element containing
* the number of bytes occupied by the string in the source string, including the enclosing parentheses.
*
* @since 2.10
*
* @param string data within which the string occurs
* @param integer offset within the source string of the opening '(' character.
*
* @return array ( key => array( 'type' => type, 'value' => value, '/length' => length ) ) for the string
*/
private static function _parse_pdf_string( &$source_string, $offset ) {
	// Purpose: decode the literal string whose opening '(' is at $offset.
	// Handles balanced nested parentheses, backslash escapes, octal codes
	// and backslash line continuations.
	//
	// '/length' counts the bytes consumed, including both parentheses. If the
	// data ends before the closing ')' the text collected so far is returned;
	// the scan never runs past the end of the buffer.
	if ( ! isset( $source_string[ $offset ] ) || '(' != $source_string[ $offset ] ) {
		return array( 'type' => 'unknown', 'value' => '', '/length' => 0 );
	}

	$source_length = strlen( $source_string );
	$output = '';
	$level = 0;
	$index = $offset + 1;

	while ( $index < $source_length ) {
		$byte = $source_string[ $index++ ];

		if ( '\\' != $byte ) {
			// just another 8-bit value, but check for balanced parentheses
			if ( '(' == $byte ) {
				$level++;
			} elseif ( ')' == $byte ) {
				if ( 0 == $level-- ) {
					break; // the closing delimiter is not part of the value
				}
			}

			$output .= $byte;
			continue;
		}

		// REVERSE SOLIDUS at the very end of the data has nothing to escape
		if ( $index >= $source_length ) {
			break;
		}

		$escaped = $source_string[ $index ];
		switch ( $escaped ) {
			case chr( 0x0A ): // line continuation; swallow an LF CR pair as a unit
				if ( ( $index + 1 ) < $source_length && chr( 0x0D ) == $source_string[ $index + 1 ] ) {
					$index++;
				}
				break;
			case chr( 0x0D ): // line continuation; swallow a CR LF pair as a unit
				if ( ( $index + 1 ) < $source_length && chr( 0x0A ) == $source_string[ $index + 1 ] ) {
					$index++;
				}
				break;
			case 'n':
				$output .= chr( 0x0A );
				break;
			case 'r':
				$output .= chr( 0x0D );
				break;
			case 't':
				$output .= chr( 0x09 );
				break;
			case 'b':
				$output .= chr( 0x08 );
				break;
			case 'f':
				$output .= chr( 0x0C );
				break;
			default: // could be a 1- to 3-digit octal value
				// Only 0-7 are octal digits; 8 and 9 are ordinary characters
				$digit_count = strspn( $source_string, '01234567', $index, 3 );
				if ( $digit_count ) {
					// Keep the low-order eight bits of an over-large code
					$output .= chr( octdec( substr( $source_string, $index, $digit_count ) ) & 0xFF );
					$index += $digit_count - 1;
				} else { // accept the character following the backslash
					$output .= $escaped;
				}
		} // switch

		$index++;
	} // while data remains

	return array( 'type' => 'string', 'value' => $output, '/length' => $index - $offset );
}
/**
* Parse a PDF Linearization Parameter Dictionary object
*
* Returns an array of dictionary contents, classified by object type: boolean, numeric, string, hex (string),
* indirect (object), name, array, dictionary, stream, and null.
* The array also has a '/length' element containing the number of bytes occupied by the
* dictionary in the source string, excluding the enclosing delimiters, if passed in.
* @since 2.10
*
* @param string data within which the object occurs, typically the start of a PDF document
* @param integer filesize of the PDF document, for validation purposes, or zero (0) to ignore filesize
*
* @return mixed array of dictionary objects on success, false on failure
*/
private static function _parse_pdf_LPD_dictionary( &$source_string, $filesize ) {
	// Purpose: find a "<</Linearized ... >> endobj" object in the first 1024
	// bytes of the document and return its parsed dictionary.
	// Returns false when no such object is present, when it cannot be parsed,
	// or when a non-zero $filesize disagrees with the dictionary.
	$header = substr( $source_string, 0, 1024 );
	$match_count = preg_match( '!obj[\x00-\x20]*<<(/Linearized).*(>>)[\x00-\x20]*endobj!', $header, $matches, PREG_OFFSET_CAPTURE );
	if ( ! $match_count ) {
		return false;
	}

	// Parsing starts at the first content character, just past the opening '<<'
	$LPD = self::_parse_pdf_dictionary( $header, $matches[1][1] );

	// The dictionary parser reports a missing end delimiter as a zero length
	if ( empty( $LPD['/length'] ) ) {
		return false;
	}

	// NOTE(review): /L is taken to be the total file length, as the PDF
	// linearization parameter dictionary defines it - confirm this is the
	// validation the $filesize parameter was meant for.
	if ( $filesize && isset( $LPD['L']['value'] ) && ( (int) $LPD['L']['value'] != (int) $filesize ) ) {
		return false;
	}

	return $LPD;
}
/**
* Parse a PDF dictionary object
*
* Returns an array of dictionary contents, classified by object type: boolean, numeric, string, hex (string),
* indirect (object), name, array, dictionary, stream, and null.
* The array also has a '/length' element containing the number of bytes occupied by the
* dictionary in the source string, excluding the enclosing delimiters.
*
* @since 2.10
*
* @param string data within which the string occurs
* @param integer offset within the source string of the opening '<<' characters or the first content character.
*
* @return array ( '/length' => length, key => array( 'type' => type, 'value' => value ) ) for each dictionary field
*/
private static function _parse_pdf_dictionary( &$source_string, $offset ) {
/*
* Find the end of the dictionary
*
* $nest is the scan position; it starts just inside the opening '<<' when
* $offset points at one, otherwise at $offset itself.
*/
if ( '<<' == substr( $source_string, $offset, 2 ) ) {
$nest = $offset + 2;
} else {
$nest = $offset;
}
// $level counts the dictionaries currently open; the loop ends when the outermost one closes
$level = 1;
do {
$dictionary_end = strpos( $source_string, '>>', $nest );
if ( false === $dictionary_end ) {
/* translators: 1: ERROR tag 2: source offset 3: nest level */
MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary offset = %2$d, nest = %3$d.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $offset, $nest ), MLACore::MLA_DEBUG_CATEGORY_ANY );
/* translators: 1: ERROR tag 2: dictionary excerpt */
MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary no end delimiter dump = %2$s.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), MLAData::mla_hex_dump( substr( $source_string, $offset, 128 ), 128, 16 ) ), MLACore::MLA_DEBUG_CATEGORY_ANY );
// Failure is reported as a dictionary with no entries and a zero '/length'
return array( '/length' => 0 );
}
// Whichever delimiter comes first decides whether a dictionary opens or closes next
$nest = strpos( $source_string, '<<', $nest );
if ( false === $nest ) {
// no further '<<' anywhere: this '>>' closes a level
$nest = $dictionary_end + 2;
$level--;
} elseif ( $nest < $dictionary_end ) {
// a nested dictionary opens before the next '>>'
$nest += 2;
$level++;
} else {
// the next '<<' lies beyond this '>>', so this '>>' closes a level
$nest = $dictionary_end + 2;
$level--;
}
} while ( $level );
// Measured from $offset through the closing '>>'; this includes the opening '<<' when $offset points at it
$dictionary_length = $dictionary_end + 2 - $offset;
$dictionary = array();
// Find every '/Name' token together with the whitespace that follows it.
// Match offsets are relative to $offset because the subject is a substr().
// \x00-\x20 for whitespace
// \(|\)|\<|\>|\[|\]|\{|\}|\/|\% for delimiters
$match_count = preg_match_all( '!/([^\x00-\x20|\(|\)|\<|\>|\[|\]|\{|\}|\/|\%]*)([\x00-\x20]*)!', substr( $source_string, $offset, $dictionary_length ), $matches, PREG_OFFSET_CAPTURE );
// $end_data is the absolute offset just past the most recent string, nested dictionary or array value; -1 when there is none
$end_data = -1;
for ( $match_index = 0; $match_index < $match_count; $match_index++ ) {
$name = $matches[1][ $match_index ][0];
// Absolute offset of the first byte after the name and its trailing whitespace
$value_start = $offset + $matches[2][ $match_index ][1] + strlen( $matches[2][ $match_index ][0] );
/*
* Skip over false matches within a string or nested dictionary
*/
if ( $value_start < $end_data ) {
continue;
}
$end_data = -1;
// Take the text up to the next '/' or line break (an optional leading '/' is allowed, for name values)
$value_count = preg_match(
'!(\/?[^\/\x0D\x0A]*)!',
substr( $source_string, $value_start, ($dictionary_end - $value_start ) ), $value_matches, PREG_OFFSET_CAPTURE );
if ( 1 == $value_count ) {
$value = trim( $value_matches[0][0] );
$length = strlen( $value );
$dictionary[ $name ]['value'] = $value;
if ( ! isset( $value[0] ) ) {
// Empty value: the entry keeps its 'value' element but is given no 'type'
/* translators: 1: ERROR tag 2: entry name 3: value excerpt */
MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary bad value [ %2$s ] dump = %3$s', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $name, MLAData::mla_hex_dump( $value, 32, 16 ) ), MLACore::MLA_DEBUG_CATEGORY_ANY );
continue;
}
// Classify the value by its literal text or its first character
if ( in_array( $value, array( 'true', 'false' ) ) ) {
$dictionary[ $name ]['type'] = 'boolean';
} elseif ( is_numeric( $value ) ) {
$dictionary[ $name ]['type'] = 'numeric';
} elseif ( '(' == $value[0] ) {
// Literal string: re-parse from the source so escapes and embedded '/' are handled
$dictionary[ $name ] = self::_parse_pdf_string( $source_string, $value_start );
$end_data = $value_start + $dictionary[ $name ]['/length'];
unset( $dictionary[ $name ]['/length'] );
} elseif ( '<' == $value[0] ) {
if ( '<' == $value[1] ) {
// Nested dictionary: parse it recursively and skip the name tokens inside it
$dictionary[ $name ]['value'] = self::_parse_pdf_dictionary( $source_string, $value_start );
$dictionary[ $name ]['type'] = 'dictionary';
// NOTE(review): the nested '/length' already spans both delimiter pairs when
// $value_start points at '<<', so the extra 4 appears to overshoot - confirm
$end_data = $value_start + 4 + $dictionary[ $name ]['value']['/length'];
unset( $dictionary[ $name ]['value']['/length'] );
} else {
// Hex string: the trimmed text is kept as found, not decoded
$dictionary[ $name ]['type'] = 'hex';
}
} elseif ( '/' == $value[0] ) {
// Name value: stored without its leading '/'
$dictionary[ $name ]['value'] = substr( $value, 1 );
$dictionary[ $name ]['type'] = 'name';
// The value matched the name pattern too, so it occupies the next slot in $matches
$match_index++; // Skip to the next key
} elseif ( '[' == $value[0] ) {
// Array: the raw text up to the first ']' is kept; nested arrays are not handled here
$dictionary[ $name ]['type'] = 'array';
$array_length = strpos( $source_string, ']', $value_start ) - ($value_start + 1);
$dictionary[ $name ]['value'] = substr( $source_string, $value_start + 1, $array_length );
$end_data = 2 + $value_start + $array_length;
} elseif ( 'null' == $value ) {
$dictionary[ $name ]['type'] = 'null';
} elseif ( 'stream' == substr( $value, 0, 6 ) ) {
$dictionary[ $name ]['type'] = 'stream';
} else {
// Last possibility: an indirect reference of the form "<object> <generation> R"
$object_count = preg_match( '!(\d+)\h+(\d+)\h+R!', $value, $object_matches );
if ( 1 == $object_count ) {
$dictionary[ $name ]['type'] = 'indirect';
$dictionary[ $name ]['object'] = $object_matches[1];
$dictionary[ $name ]['generation'] = $object_matches[2];
} else {
$dictionary[ $name ]['type'] = 'unknown';
}
}
} else {
// The value pattern did not match; record an empty placeholder entry
$dictionary[ $matches[1][ $match_index ][0] ] = array( 'value' => '' );
$dictionary[ $matches[1][ $match_index ][0] ]['type'] = 'nomatch';
}
} // foreach match
// '/length' cannot collide with an entry name because names are stored without a leading '/'
$dictionary['/length'] = $dictionary_length;
return $dictionary;
}
/**
* Extract dictionary from traditional cross-reference + trailer documents
*
* @since 2.10
*
* @param string full path to the desired file
* @param integer offset within file of the cross-reference table
*
* @return mixed array of "PDF dictionary arrays", newest first, or NULL on failure
*/
private static function _extract_pdf_trailer( $file_name, $file_offset, $visited = array() ) {
	// Purpose: read the cross-reference data at $file_offset and return its
	// trailer dictionary, followed recursively through any /Prev entries.
	//
	// Handles both a traditional "xref ... trailer << >>" section and a
	// cross-reference stream object. Returns an array of dictionaries, or
	// NULL when neither form is found at the offset.
	//
	// $visited is for internal use by the recursion: it lists the offsets
	// already processed, so a /Prev chain that loops back on itself
	// terminates instead of recursing without bound.
	$chunksize = 16384;
	$file_offset = (int) $file_offset;

	if ( in_array( $file_offset, $visited, true ) ) {
		return NULL;
	}

	$visited[] = $file_offset;

	$tail = file_get_contents( $file_name, false, NULL, $file_offset, $chunksize );
	if ( ! is_string( $tail ) ) {
		return NULL;
	}

	/*
	 * look for traditional xref and trailer
	 */
	if ( 'xref' == substr( $tail, 0, 4 ) ) {
		$xref_length = self::_parse_pdf_xref_section( $file_name, $file_offset + 4 );
		$chunk_offset = 4 + $xref_length;

		// Re-read when the trailer may lie beyond, or too close to the end of, the current buffer
		if ( $chunk_offset > ( $chunksize - 1024 ) ) {
			$file_offset += $chunk_offset;
			$tail = file_get_contents( $file_name, false, NULL, $file_offset, $chunksize );
			if ( ! is_string( $tail ) ) {
				return NULL;
			}

			$chunk_offset = 0;
		}

		$match_count = preg_match( '/[\x00-\x20]*trailer[\x00-\x20]+/', $tail, $matches, PREG_OFFSET_CAPTURE, $chunk_offset );
		if ( ! $match_count ) {
			return NULL;
		}

		$chunk_offset = $matches[0][1] + strlen( $matches[0][0] );
		$dictionary = self::_parse_pdf_dictionary( $tail, $chunk_offset );

		// A /Prev value that is not a number cannot be used as a file offset
		$other_trailers = NULL;
		if ( isset( $dictionary['Prev']['value'] ) && is_numeric( $dictionary['Prev']['value'] ) ) {
			$other_trailers = self::_extract_pdf_trailer( $file_name, $dictionary['Prev']['value'], $visited );
		}

		// NOTE(review): this branch appends the current dictionary AFTER the
		// earlier ones, while the cross-reference stream branch below puts it
		// first. The order is left as found - confirm which one is intended.
		if ( is_array( $other_trailers ) ) {
			return array_merge( $other_trailers, array( $dictionary ) );
		}

		return array( $dictionary );
	} // found 'xref'

	/*
	 * Look for a cross-reference stream
	 */
	$match_count = preg_match( '!(\d+)\\h+(\d+)\\h+obj[\x00-\x20]*!', $tail, $matches, PREG_OFFSET_CAPTURE );
	if ( ! $match_count ) {
		return NULL;
	}

	$chunk_offset = $matches[0][1] + strlen( $matches[0][0] );
	if ( '<<' != substr( $tail, $chunk_offset, 2 ) ) {
		return NULL;
	}

	$dictionary = self::_parse_pdf_dictionary( $tail, $chunk_offset );

	/*
	 * Parse the cross-reference stream following the dictionary, if present.
	 * The returned stream length is not needed here.
	 */
	if ( isset( $dictionary['Type']['value'] ) && 'XRef' == $dictionary['Type']['value'] ) {
		$entry_parms = isset( $dictionary['W']['value'] ) ? $dictionary['W']['value'] : '';
		self::_parse_pdf_xref_stream( $file_name, $file_offset + $chunk_offset + (int) $dictionary['/length'], $entry_parms );
	}

	$other_trailers = NULL;
	if ( isset( $dictionary['Prev']['value'] ) && is_numeric( $dictionary['Prev']['value'] ) ) {
		$other_trailers = self::_extract_pdf_trailer( $file_name, $dictionary['Prev']['value'], $visited );
	}

	if ( is_array( $other_trailers ) ) {
		return array_merge( array( $dictionary ), $other_trailers );
	}

	return array( $dictionary );
}
/**
* Extract Metadata from a PDF file
*
* @since 2.10
*
* @param string full path to the desired file
*
* @return array ( 'xmp' => array( key => value ), 'pdf' => array( key => value ) ) for each metadata field, in string format
*/
public static function mla_extract_pdf_metadata( $file_name ) {
	// Purpose: collect the Info dictionary entries and the XMP metadata of a
	// PDF file. Always returns array( 'xmp' => array, 'pdf' => array ); both
	// are empty when the file is missing, and 'xmp' is empty when no
	// "startxref ... %%EOF" marker is found in the tail of the file.
	$xmp = array();
	$metadata = array();
	self::$pdf_indirect_objects = NULL;
	$chunksize = 16384;

	if ( ! file_exists( $file_name ) ) {
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	// Read the last chunk of the file, where the startxref marker lives
	$filesize = filesize( $file_name );
	$file_offset = ( $chunksize < $filesize ) ? ( $filesize - $chunksize ) : 0;
	$tail = file_get_contents( $file_name, false, NULL, $file_offset );
	if ( ! is_string( $tail ) ) {
		$tail = '';
	}

	// For a small file the tail already contains the header
	if ( 0 == $file_offset ) {
		$header = substr( $tail, 0, 128 );
	} else {
		$header = file_get_contents( $file_name, false, NULL, 0, 128 );
	}

	if ( '%PDF-' == substr( $header, 0, 5 ) ) {
		$metadata['PDF_Version'] = substr( $header, 1, 7 );
		$metadata['PDF_VersionNumber'] = substr( $header, 5, 3 );
	}

	/*
	 * Find the xref and (optional) trailer
	 */
	$match_count = preg_match_all( '/startxref[\x00-\x20]+(\d+)[\x00-\x20]+\%\%EOF/', $tail, $matches, PREG_OFFSET_CAPTURE );
	if ( 0 == $match_count ) {
		/* translators: 1: ERROR tag 2: path and file */
		MLACore::mla_debug_add( sprintf( _x( '%1$s: File "%2$s", startxref not found.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $file_name ), MLACore::MLA_DEBUG_CATEGORY_ANY );
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	// The last marker in the tail is the one to use
	$startxref = (integer) $matches[1][ $match_count - 1 ][0];
	$trailer_dictionaries = self::_extract_pdf_trailer( $file_name, $startxref );
	if ( ! is_array( $trailer_dictionaries ) ) {
		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}

	/*
	 * Look for the Info dictionary; the first trailer that names one wins
	 */
	$info_reference = NULL;
	foreach ( $trailer_dictionaries as $trailer_dictionary ) {
		if ( isset( $trailer_dictionary['Info'] ) ) {
			$info_reference = $trailer_dictionary['Info'];
			break;
		}
	}

	// Only an indirect reference ("N G R") carries the object number needed to find the dictionary
	if ( isset( $info_reference['object'] ) ) {
		$info_generation = isset( $info_reference['generation'] ) ? $info_reference['generation'] : 0;
		$info_object = self::_find_pdf_indirect_dictionary( $file_name, $info_reference['object'], $info_generation );

		/*
		 * Handle single or multiple Info instances
		 */
		$info_objects = array();
		if ( $info_object ) {
			if ( 1 == $info_object['count'] ) {
				$info_objects[] = $info_object;
			} else {
				for ( $index = 0; $index < $info_object['count']; $index++ ) {
					$instance = self::_find_pdf_indirect_dictionary( $file_name, $info_reference['object'], $info_generation, $index );

					// An instance whose end delimiter cannot be found comes back as NULL
					if ( is_array( $instance ) ) {
						$info_objects[] = $instance;
					}
				}
			}
		}

		// Later instances overwrite entries from earlier ones
		foreach ( $info_objects as $info_object ) {
			$info_dictionary = self::_parse_pdf_dictionary( $info_object['content'], 0 );
			unset( $info_dictionary['/length'] );

			foreach ( $info_dictionary as $name => $value ) {
				// An entry with an empty value carries no 'type' element
				$type = isset( $value['type'] ) ? $value['type'] : '';

				if ( 'string' != $type ) {
					$metadata[ $name ] = $value['value'];
					continue;
				}

				$prefix = substr( $value['value'], 0, 2 );
				if ( 'D:' == $prefix ) {
					$metadata[ $name ] = MLAData::mla_parse_pdf_date( $value['value'] );
				} elseif ( ( chr(0xFE) . chr(0xFF) ) == $prefix ) {
					$metadata[ $name ] = self::_parse_pdf_UTF16BE( $value['value'] );
				} else {
					$metadata[ $name ] = $value['value'];
				}
			} // each info entry
		} // foreach Info object

		/*
		 * Remove spurious "Filter" dictionaries
		 */
		unset( $metadata['Filter'] );
		unset( $metadata['Length'] );
		unset( $metadata['Length1'] );
	} // found Info reference

	/*
	 * Look for XMP Metadata
	 */
	$root_reference = NULL;
	foreach ( $trailer_dictionaries as $trailer_dictionary ) {
		if ( isset( $trailer_dictionary['Root'] ) ) {
			$root_reference = $trailer_dictionary['Root'];
			break;
		}
	}

	if ( isset( $root_reference['object'] ) ) {
		$root_generation = isset( $root_reference['generation'] ) ? $root_reference['generation'] : 0;
		$root_object = self::_find_pdf_indirect_dictionary( $file_name, $root_reference['object'], $root_generation );

		if ( $root_object ) {
			$root_dictionary = self::_parse_pdf_dictionary( $root_object['content'], 0 );
			unset( $root_dictionary['/length'] );

			if ( isset( $root_dictionary['Metadata'] ) ) {
				// The XMP search starts just past the Metadata object's dictionary;
				// it starts at the top of the file when that object cannot be located.
				$xmp_offset = 0;
				if ( isset( $root_dictionary['Metadata']['object'] ) ) {
					$xmp_generation = isset( $root_dictionary['Metadata']['generation'] ) ? $root_dictionary['Metadata']['generation'] : 0;
					$xmp_object = self::_find_pdf_indirect_dictionary( $file_name, $root_dictionary['Metadata']['object'], $xmp_generation );
					if ( is_array( $xmp_object ) ) {
						$xmp_offset = $xmp_object['start'] + $xmp_object['length'];
					}
				}

				$xmp = MLAData::mla_parse_xmp_metadata( $file_name, $xmp_offset );
				if ( is_array( $xmp ) ) {
					$metadata = array_merge( $metadata, $xmp );
				} else {
					// Recovery attempt: search from the start of the file.
					// NOTE(review): recovered values are not merged into 'pdf' - confirm this is intended.
					$xmp = MLAData::mla_parse_xmp_metadata( $file_name, 0 );
				}

				// Keep the documented return shape even when no XMP was found
				if ( ! is_array( $xmp ) ) {
					$xmp = array();
				}
			} // found Metadata reference
		} // found Root object
	} // found Root reference

	return array( 'xmp' => $xmp, 'pdf' => $metadata );
}
} // class MLAPDF
?>