wordpress-preseed/wp-content/plugins/jetpack/_inc/lib/jetpack-wpes-query-builder/jetpack-wpes-query-parser.php

<?php

/**
 * Parse a pure text query into WordPress Elasticsearch query. This builds on
 * the Jetpack_WPES_Query_Builder() to provide search query parsing.
 *
 * The key part of this parser is taking a user's query string typed into a box
 * and converting it into an ES search query.
 *
 * This varies by application, but roughly it means extracting some parts of the query
 * (authors, tags, and phrases) that are treated as a filter. Then taking the
 * remaining words and building the correct query (possibly with prefix searching
 * if we are doing search as you type)
 *
 * This class only supports ES 2.x+
 *
 * This parser builds queries of the form:
 *   bool:
 *     must:
 *       AND match of a single field (ideally an edgengram field)
 *     filter:
 *       filter clauses from context (eg @gibrown, #news, etc)
 *     should:
 *       boosting of results by various fields
 *
 * Features supported:
 *  - search as you type
 *  - phrases
 *  - supports querying across multiple languages at once
 *
 * Example usage (from Search on Reader Manage):
 *
 *		jetpack_require_lib( 'jetpack-wpes-query-builder/jetpack-wpes-query-parser' );
 *		$parser = new Jetpack_WPES_Search_Query_Parser( $args['q'], array( $lang ) );
 *
 *		//author
 *		$parser->author_field_filter( array(
 *			'prefixes' => array( '@' ),
 *			'wpcom_id_field' => 'author_id',
 *			'must_query_fields' => array( 'author.engram', 'author_login.engram' ),
 *			'boost_query_fields' => array( 'author^2', 'author_login^2', 'title.default.engram' ),
 *		) );
 *
 *		//remainder of query
 *		$match_content_fields = $parser->merge_ml_fields(
 *			array(
 *				'all_content' => 0.1,
 *			),
 *			array(
 *				'all_content.default.engram^0.1',
 *			)
 *		);
 *		$boost_content_fields = $parser->merge_ml_fields(
 *			array(
 *				'title' => 2,
 *				'description' => 1,
 *				'tags' => 1,
 *			),
 *			array(
 *				'author_login^2',
 *				'author^2',
 *			)
 *		);
 *
 *		$parser->phrase_filter( array(
 *			'must_query_fields' => $match_content_fields,
 *			'boost_query_fields' => $boost_content_fields,
 *		) );
 *		$parser->remaining_query( array(
 *			'must_query_fields' => $match_content_fields,
 *			'boost_query_fields' => $boost_content_fields,
 *		) );
 *
 *		//Boost on phrases
 *		$parser->remaining_query( array(
 *			'boost_query_fields' => $boost_content_fields,
 *			'boost_query_type'   => 'phrase',
 *		) );
 *
 *		//boosting
 *		$parser->add_max_boost_to_functions( 20 );
 *		$parser->add_function( 'field_value_factor', array(
 *			'follower_count' => array(
 *				'modifier' => 'sqrt',
 *				'factor' => 1,
 *				'missing' => 0,
 *			) ) );
 *
 *		//Filtering
 *		$parser->add_filter( array(
 *			'exists' => array( 'field' => 'langs.' . $lang )
 *		) );
 *
 *		//run the query
 *		$es_query_args = array(
 *			'name' => 'feeds',
 *			'blog_id' => false,
 *			'security_strategy' => 'a8c',
 *			'type' => 'feed,blog',
 *			'fields' => array( 'blog_id', 'feed_id' ),
 *			'query' => $parser->build_query(),
 *			'filter' => $parser->build_filter(),
 *			'size' => $size,
 *			'from' => $from
 *		);
 *		$es_results = es_api_search_index( $es_query_args, 'api-feed-find' );
 *
 */

jetpack_require_lib( 'jetpack-wpes-query-builder' );

class Jetpack_WPES_Search_Query_Parser extends Jetpack_WPES_Query_Builder {

	protected $orig_query = '';
	protected $current_query = '';
	protected $langs;
	protected $avail_langs = array( 'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja', 'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' );

	/*
	 * Set up the parser for one user query.
	 *
	 *  $user_query: the raw query text. It is stored untouched in $orig_query
	 *    and copied to $current_query, the working copy that the filter
	 *    methods read from and strip pieces out of.
	 *  $langs: list of language codes; stored after passing through norm_langs()
	 */
	public function __construct( $user_query, $langs ) {
		$this->orig_query    = $user_query;
		$this->current_query = $user_query;
		$this->langs         = $this->norm_langs( $langs );
	}

	protected $extracted_phrases = array();

	///////////////////////////////////////////////////////
	// Methods for Building arrays of multilingual fields

	/*
	 * Normalize language codes
	 *
	 *  Each code is cut at the first '-' or '_' (eg 'pt_BR' becomes 'pt'). A
	 *  code found in $avail_langs is kept, anything else is mapped to
	 *  'default'.
	 *
	 *  returns the distinct resulting codes, in order of first appearance
	 */
	public function norm_langs( $langs ) {
		$seen = array();
		foreach ( $langs as $code ) {
			$base = strtok( $code, '-_' );
			$key  = in_array( $base, $this->avail_langs ) ? $base : 'default';
			//array keys give us de-duplication for free
			$seen[ $key ] = true;
		}
		return array_keys( $seen );
	}

	/*
	 * Expand field prefixes into per-language field names with boosts.
	 *
	 *  $fields2boosts: array of field prefix => boost. Every prefix is emitted
	 *    once per language in $this->langs as "<prefix>.<lang>^<boost>"
	 *  $additional_fields: fields appended unchanged after the expanded ones
	 *
	 *  returns the combined list of field strings
	 */
	public function merge_ml_fields( $fields2boosts, $additional_fields ) {
		$expanded = array();
		foreach ( $fields2boosts as $prefix => $boost ) {
			foreach ( $this->langs as $lang ) {
				$expanded[] = "{$prefix}.{$lang}^{$boost}";
			}
		}
		foreach ( $additional_fields as $extra ) {
			$expanded[] = $extra;
		}
		return $expanded;
	}

	////////////////////////////////////
	// Extract Fields for Filtering on

	/*
	 * Extract any @mentions from the user query and strip them from it.
	 *  A mention whose lowercased text is an exact login match is collected
	 *  into a terms filter on the wp.com id field. Any other mention is
	 *  searched for in must_query_fields instead. Every mention, matched or
	 *  not, is also queried against boost_query_fields as a 'should' clause.
	 *  A mention containing double quotes (eg @"Greg Brown") is queried as a
	 *  phrase with the quotes removed.
	 *
	 *  args:
	 *    wpcom_id_field: wp.com id field (default 'author_id')
	 *    must_query_fields: array of fields to search for matching results (optional)
	 *    boost_query_fields: array of fields to search in for boosting results (optional)
	 *    prefixes: array of prefixes that the user can use to indicate an author
	 *      (default '@')
	 *
	 *  returns true/false of whether any were found
	 *
	 * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
	 */
	public function author_field_filter( $args ) {
		$args = wp_parse_args( $args, array(
			'wpcom_id_field' => 'author_id',
			'must_query_fields' => null,
			'boost_query_fields' => null,
			'prefixes' => array( '@' ),
		) );

		//gather the mentions for every prefix before the query is modified
		$mentions = array();
		foreach ( $args['prefixes'] as $prefix ) {
			$hits = $this->get_fields( $prefix );
			if ( $hits ) {
				$mentions = array_merge( $mentions, $hits );
			}
		}

		if ( ! $mentions ) {
			return false;
		}

		foreach ( $args['prefixes'] as $prefix ) {
			$this->remove_fields( $prefix );
		}

		$has_must_fields  = ! empty( $args['must_query_fields'] );
		$has_boost_fields = ! empty( $args['boost_query_fields'] );
		$matched_ids      = array();

		//each mention is either a known login (filter) or free text (query)
		foreach ( $mentions as $mention ) {
			//the lookup uses the mention exactly as extracted, quotes included
			$user = get_user_by( 'login', strtolower( $mention ) );
			if ( $user ) {
				$matched_ids[ $user->ID ] = true;
			}

			$quoted = ( false !== strpos( $mention, '"' ) );
			$text   = $quoted ? str_replace( '"', '', $mention ) : $mention;

			//only text that did not resolve to a user has to match somewhere
			if ( $has_must_fields && ! $user ) {
				$must = array(
					'fields' => $args['must_query_fields'],
					'query' => $text,
				);
				if ( $quoted ) {
					$must['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $must ) );
			}

			if ( $has_boost_fields ) {
				$boost = array(
					'fields' => $args['boost_query_fields'],
					'query' => $text,
				);
				if ( $quoted ) {
					$boost['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $boost ), 'should' );
			}
		}

		if ( $matched_ids ) {
			$this->add_filter( array(
				'terms' => array( $args['wpcom_id_field'] => array_keys( $matched_ids ) ),
			) );
		}

		return true;
	}

	/*
	 * Extract any prefix followed by text and strip it from the user query.
	 *   The text is added as a must clause on must_query_fields and as a
	 *   'should' clause on boost_query_fields. Text containing double quotes
	 *   is queried as a phrase with the quotes removed.
	 *   This can be used for hashtags. eg #News, or #"current events",
	 *   but also works for any arbitrary field. eg from:Greg
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the tag
	 *      (optional, default 'tag.name')
	 *    boost_query_fields: array of fields to boost search on
	 *      (optional, default 'tag.name')
	 *    prefixes: array of prefixes that the user can use to indicate a tag
	 *      (default '#')
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	public function text_field_filter( $args ) {
		$args = wp_parse_args( $args, array(
			'must_query_fields' => array( 'tag.name' ),
			'boost_query_fields' => array( 'tag.name' ),
			'prefixes' => array( '#' ),
		) );

		//gather the tagged text for every prefix before the query is modified
		$tagged = array();
		foreach ( $args['prefixes'] as $prefix ) {
			$hits = $this->get_fields( $prefix );
			if ( $hits ) {
				$tagged = array_merge( $tagged, $hits );
			}
		}

		if ( ! $tagged ) {
			return false;
		}

		foreach ( $args['prefixes'] as $prefix ) {
			$this->remove_fields( $prefix );
		}

		$has_must_fields  = ! empty( $args['must_query_fields'] );
		$has_boost_fields = ! empty( $args['boost_query_fields'] );

		foreach ( $tagged as $tag ) {
			$quoted = ( false !== strpos( $tag, '"' ) );
			$text   = $quoted ? str_replace( '"', '', $tag ) : $tag;

			if ( $has_must_fields ) {
				$must = array(
					'fields' => $args['must_query_fields'],
					'query' => $text,
				);
				if ( $quoted ) {
					$must['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $must ) );
			}

			if ( $has_boost_fields ) {
				$boost = array(
					'fields' => $args['boost_query_fields'],
					'query' => $text,
				);
				if ( $quoted ) {
					$boost['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $boost ), 'should' );
			}
		}

		return true;
	}

	/*
	 * Extract quoted phrases from the user query and add them as phrase
	 *   queries. Completed phrases ("..." first, then '...') are collected and
	 *   stripped from the query. After that an opening quote that is never
	 *   closed turns the rest of the query into one more phrase; if both a
	 *   dangling " and a dangling ' are found, the ' one is the one kept.
	 *   Quotes can be either '' or ""
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the phrases
	 *      (default 'all_content')
	 *    boost_query_fields: array of fields to boost the phrases on
	 *      (optional, default 'title')
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	public function phrase_filter( $args ) {
		$args = wp_parse_args( $args, array(
			'must_query_fields' => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
		) );

		//completed phrases: double quoted first, then single quoted
		$phrases = array();
		foreach ( array( '/"([^"]+)"/', "/'([^']+)'/" ) as $closed_pattern ) {
			if ( preg_match_all( $closed_pattern, $this->current_query, $found ) ) {
				$phrases = array_merge( $phrases, $found[1] );
				$this->current_query = preg_replace( $closed_pattern, '', $this->current_query );
			}
		}

		//look for a final, uncompleted phrase
		$dangling = false;
		foreach ( array( '/"([^"]+)$/', "/(?:'\B|\B')([^']+)$/" ) as $open_pattern ) {
			if ( preg_match_all( $open_pattern, $this->current_query, $found ) ) {
				$dangling = $found[1][0];
				$this->current_query = preg_replace( $open_pattern, '', $this->current_query );
			}
		}
		if ( $dangling ) {
			$phrases[] = $dangling;
		}

		if ( ! $phrases ) {
			return false;
		}

		$has_boost_fields = ! empty( $args['boost_query_fields'] );
		foreach ( $phrases as $phrase ) {
			$must = array(
				'fields' => $args['must_query_fields'],
				'query' => $phrase,
				'type' => 'phrase',
			);
			$this->add_query( array( 'multi_match' => $must ) );

			if ( $has_boost_fields ) {
				//the boost is an AND of the words, not a phrase match
				$boost = array(
					'fields' => $args['boost_query_fields'],
					'query' => $phrase,
					'operator' => 'and',
				);
				$this->add_query( array( 'multi_match' => $boost ), 'should' );
			}
		}

		return true;
	}

	/*
	 * Query fields based on whatever is left of the user query.
	 *   This could be the final AND part of the query terms to match, or it
	 *   could be boosting certain elements of the query. Nothing is added
	 *   when the remaining query is empty or only whitespace.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator for the boost query (default 'and')
	 *    boost_query_type: multi_match type for the boost query (default 'best_fields')
	 *
	 */
	public function remaining_query( $args ) {
		$args = wp_parse_args( $args, array(
			'must_query_fields' => null,
			'boost_query_fields' => null,
			'boost_operator' => 'and',
			'boost_query_type' => 'best_fields',
		) );

		$text = $this->current_query;
		if ( empty( $text ) || ctype_space( $text ) ) {
			return;
		}

		if ( ! empty( $args['must_query_fields'] ) ) {
			//every remaining term has to match
			$must = array(
				'fields' => $args['must_query_fields'],
				'query' => $text,
				'operator' => 'and',
			);
			$this->add_query( array( 'multi_match' => $must ) );
		}

		if ( ! empty( $args['boost_query_fields'] ) ) {
			$boost = array(
				'fields' => $args['boost_query_fields'],
				'query' => $text,
				'operator' => $args['boost_operator'],
				'type' => $args['boost_query_type'],
			);
			$this->add_query( array( 'multi_match' => $boost ), 'should' );
		}
	}

	/*
	 * Query fields using a prefix query (alphabetical expansions on the index).
	 *   This is not recommended. Slower performance and worse relevancy.
	 *
	 *  (UNTESTED! Copied from old prefix expansion code)
	 *
	 *  The last word of the remaining query is treated as a possibly
	 *  unfinished word (the prefix word); everything before it is the prefix
	 *  remainder. If the query ends in a space there is no prefix word and a
	 *  plain AND match is used instead. Nothing is added when the remaining
	 *  query is empty or only whitespace.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms
	 *      (optional, default 'all_content')
	 *    boost_query_fields: array of fields to boost the remaining terms on
	 *      (optional, default 'title')
	 *    boost_operator: operator for the boost query (default 'and').
	 *      'boost_query_operator' is accepted as an alias and wins when set.
	 *    boost_query_type: multi_match type for the boost query (default 'best_fields')
	 *
	 */
	public function remaining_prefix_query( $args ) {
		$defaults = array(
			'must_query_fields' => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
			'boost_operator' => 'and',
			'boost_query_type' => 'best_fields',
		);
		$args = wp_parse_args( $args, $defaults );

		if ( empty( $this->current_query ) || ctype_space( $this->current_query ) ) {
			return;
		}

		//The documented/defaulted key is 'boost_operator'. Older code read the
		//never-defaulted 'boost_query_operator', so keep honoring it if a
		//caller passes it.
		$boost_operator = isset( $args['boost_query_operator'] )
			? $args['boost_query_operator']
			: $args['boost_operator'];

		$full_text = $this->current_query;

		//////////////////////////////////
		// Example cases to think about:
		// "elasticse"
		// "elasticsearch"
		// "elasticsearch "
		// "elasticsearch lucen"
		// "elasticsearch lucene"
		// "the future"  - note the stopword which will match nothing!
		// "F1" - an exact match that also has tons of expansions
		// "こんにちは" ja "hello"
		// "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
		//   - this could still be better I bet. Maybe we need to analyze with ES first?
		//

		/////////////////////////////
		//extract pieces of query
		// eg: "PREFIXREMAINDER PREFIXWORD"
		//     "elasticsearch lucen"

		$prefix_word = false;
		if ( preg_match_all( '/([^ ]+)$/', $full_text, $matches ) ) {
			$prefix_word = $matches[1][0];
		}

		$prefix_remainder = preg_replace( '/([^ ]+)$/', '', $full_text );
		if ( '' === $prefix_remainder || ctype_space( $prefix_remainder ) ) {
			$prefix_remainder = false;
		}

		//compare against false so that a last word of "0" still counts
		if ( false === $prefix_word ) {
			//Space at the end of the query, so skip using a prefix query
			if ( ! empty( $args['must_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query' => $full_text,
						'operator' => 'and',
					) ) );
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $full_text,
						'operator' => $boost_operator,
						'type' => $args['boost_query_type'],
					) ), 'should' );
			}
			return;
		}

		//must match the prefix word and the prefix remainder
		if ( ! empty( $args['must_query_fields'] ) ) {
			//need to do an OR across a few fields to handle all cases
			$must_q = array( 'bool' => array( 'should' => array(), 'minimum_should_match' => 1 ) );

			//treat all words as an exact search (boosts complete word like "news"
			//from prefixes of "newspaper")
			$must_q['bool']['should'][] = array( 'multi_match' => array(
				'fields' => $args['must_query_fields'],
				'query' => $full_text,
				'operator' => 'and',
				'type' => 'cross_fields',
			) );

			//always optimistically try and match the full text as a phrase
			//prefix "the futu" should try to match "the future"
			//otherwise the first stopword kinda breaks
			//This also works as the prefix match for a single word "elasticsea"
			$must_q['bool']['should'][] = array( 'multi_match' => array(
				'fields' => $args['must_query_fields'],
				'query' => $full_text,
				'operator' => 'and',
				'type' => 'phrase_prefix',
				'max_expansions' => 100,
			) );

			if ( $prefix_remainder ) {
				//Multiple words found, so treat each word on its own and not just as
				//a part of a phrase
				//"elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
				$must_q['bool']['should'][] = array( 'bool' => array(
					'must' => array(
						array( 'multi_match' => array(
							'fields' => $args['must_query_fields'],
							'query' => $prefix_word,
							'operator' => 'and',
							'type' => 'phrase_prefix',
							'max_expansions' => 100,
						) ),
						array( 'multi_match' => array(
							'fields' => $args['must_query_fields'],
							'query' => $prefix_remainder,
							'operator' => 'and',
							'type' => 'cross_fields',
						) ),
					)
				) );
			}

			$this->add_query( $must_q );
		}

		//Now add any boosting of the query
		if ( ! empty( $args['boost_query_fields'] ) ) {
			//treat all words as an exact search (boosts complete word like "news"
			//from prefixes of "newspaper")
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['boost_query_fields'],
					'query' => $full_text,
					'operator' => $boost_operator,
					'type' => $args['boost_query_type'],
				) ), 'should' );

			//optimistically boost the full phrase prefix match
			//(a 'should' clause: a boost must not become a hard requirement)
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['boost_query_fields'],
					'query' => $full_text,
					'operator' => 'and',
					'type' => 'phrase_prefix',
					'max_expansions' => 100,
				) ), 'should' );
		}
	}

	/*
	 * Boost results based on the lang probability overlaps
	 *   One field_value_factor function is added per entry, with the entry's
	 *   probability as its 'factor'.
	 *
	 *  args:
	 *    langs2prob: list of languages to search in with associated boosts
	 *
	 *  NOTE(review): the language key is never used and no field name is
	 *  passed to add_function(), unlike the field-keyed form in the usage
	 *  example at the top of this file - confirm what add_function() expects.
	 */
	public function boost_lang_probs( $langs2prob ) {
		foreach ( $langs2prob as $probability ) {
			$factor_args = array(
				'modifier' => 'none',
				'factor' => $probability,
				'missing' => 0.01, //1% chance doc did not have right lang detected
			);
			$this->add_function( 'field_value_factor', $factor_args );
		}
	}

	////////////////////////////////////
	// Helper Methods

	//Build the regex shared by get_fields() and remove_fields().
	// The prefix is matched literally (preg_quote), so prefixes such as '$'
	// or '+' are safe. The 'u' modifier makes \p{Z} test whole UTF-8
	// characters; without it a 0xA0 byte inside a character such as "à" is
	// mistaken for a separator and the token is cut mid-character.
	protected function field_regex( $field_prefix ) {
		return '/' . preg_quote( $field_prefix, '/' ) . '(("[^"]+")|([^\\p{Z}]+))/u';
	}

	//Get the text after some prefix. eg @gibrown, or @"Greg Brown"
	// Returns an array of the matched texts (quoted text keeps its quotes),
	// or false when nothing matches or the query is not valid UTF-8.
	protected function get_fields( $field_prefix ) {
		if ( preg_match_all( $this->field_regex( $field_prefix ), $this->current_query, $match ) ) {
			return $match[1];
		}
		return false;
	}

	//Remove the prefix and text from the query
	// The query is left untouched if the replacement fails (preg_replace
	// returns null, eg on invalid UTF-8) rather than being wiped out.
	protected function remove_fields( $field_name ) {
		$stripped = preg_replace( $this->field_regex( $field_name ), '', $this->current_query );
		if ( null !== $stripped ) {
			$this->current_query = $stripped;
		}
	}

	//Best effort string truncation that splits on word breaks
	// A string whose mb_strwidth() is within $limit is returned unchanged.
	// Otherwise the string is scanned backwards from offset $limit for $break
	// and cut just before the first one found; with no break found it is
	// chopped mid-word at $limit.
	// NOTE(review): widths and offsets are mixed here, so results for
	// double-width characters are presumably approximate - hence "best effort".
	protected function truncate_string( $string, $limit, $break=" " ) {
		if ( mb_strwidth( $string ) <= $limit ) {
			return $string;
		}

		//nearest break at or before $limit wins (offset 0 is never checked)
		for ( $pos = $limit; $pos > 0; $pos-- ) {
			if ( $break === mb_strimwidth( $string, $pos, 1 ) ) {
				return mb_strimwidth( $string, 0, $pos );
			}
		}

		//no break available, so chop mid-word
		return mb_strimwidth( $string, 0, $limit );
	}

}