Source code for text_data.query

"""This builds and runs search queries for :class:`text_data.index.Corpus`.

For the most part, you won't be using this directly. Instead, you'll likely
be using :class:`text_data.index.Corpus`. However, viewing the :code:`__repr__`
for the query you're running can be helpful for debugging or validating
queries.
"""
import collections
import re
from typing import Callable, List

QUERY_TERMS = {"AND", "OR", "NOT"}
#: This represents a set of words you want to search for.
#:
#: Each query item has attached to it a set of words,
#: an identifier stating whether the query terms are part of
#: an exact phrase (i.e. whether the order matters)
#: and what kind of query (a boolean AND query, a boolean OR query, or a boolean NOT query),
#: is being performed on the query.
#:
#: Args:
#:      words (List[str]): A list of words representing all of the words that will be searched for.
#:      exact (bool): Whether the search terms are part of an exact phrase match
#:      modifier (str): The boolean query (AND, OR, or NOT)
QueryItem = collections.namedtuple("QueryItem", "words exact modifier")


class Query:
    r"""Represents a query. This is used internally by :class:`text_data.index.Corpus` to handle searching.

    The basic formula for writing queries should be familiar; all of the
    queries are simple boolean phrases. But here are more complete
    specifications:

    In order to search for places where two words appeared, you simply need
    to type the two words::

        Query("i am")

    Searches using this query will look for documents where the words "i"
    and "am" both appeared. To have them look for places where either word
    appeared, use an "OR" query::

        Query("i OR am")

    Alternatively, you can look for documents where one word occurred but
    the other didn't using a NOT query::

        Query("i NOT am")

    To search for places where the phrase "i am" appeared, use quotes::

        Query("'i am'")

    You can use AND queries to limit the results of previous sets of
    queries. For instance::

        Query("i OR am AND you")

    will find places where "you" and *either* "I" or "am" appeared.

    In order to search for the literal words 'AND', 'OR', or 'NOT', you must
    encapsulate them in quotes::

        Query("'AND'")

    The same holds for keywords that sit inside a longer quoted phrase;
    they are treated as ordinary words of that phrase::

        Query("'rock AND roll'")

    Finally, you may customize the way your queries are parsed by passing a
    tokenizer. By default, :code:`Query` identifies strings of text that it
    needs to split and uses :code:`str.split` to split the strings. But you
    can change how to split the text, which can be helpful/necessary if the
    words you're searching for have spaces in them. For instance, this will
    split the words you're querying by spaces, unless the words are
    'united states'::

        >>> import re
        >>> us_phrase = re.compile(r"(united states|\S+)")
        >>> Query("he is from the united states", query_tokenizer=us_phrase.findall)
        <Query ([[QueryItem(words=['he', 'is', 'from', 'the', 'united states'], exact=False, modifier='OR')]])>

    Args:
        query_string: The human-readable query
        query_tokenizer: A function to tokenize phrases in the query
            (Defaults to string.split). **Note:** This specifically
            tokenizes individual phrases in the query. As a result, the
            function does not need to handle quotations.

    Raises:
        ValueError: If the query starts with one of the boolean keywords.
    """

    # A quoted phrase must be delimited by whitespace (or the string edges).
    # The delimiters are matched with zero-width lookarounds so that a single
    # space between two adjacent quoted phrases can serve as the right edge of
    # the first and the left edge of the second.
    _QUOTE_REGEX = re.compile(
        r"(?:(?<=\s)|^)(?:\'(?P<single>[^\']+)\'|\"(?P<double>[^\"]+)\")(?=\s|$)"
    )

    def __init__(
        self,
        query_string: str,
        query_tokenizer: Callable[[str], List[str]] = str.split,
    ):
        # sorted() only makes the generated pattern deterministic; the
        # keywords are not prefixes of one another, so order is irrelevant.
        keywords = "|".join(sorted(QUERY_TERMS))

        # starting with a key word should raise an error
        if re.search(fr"^\s*({keywords})(?:\s+|$)", query_string) is not None:
            raise ValueError("You cannot use a keyword at the beginning of the query")

        # one list of QueryItem objects per keyword-delimited subquery
        self.queries = []
        self.raw_query = query_string

        # Keywords that fall inside a quoted phrase are literal words, not
        # operators, so remember where the quoted phrases are.
        quoted_spans = [
            quoted.span() for quoted in self._QUOTE_REGEX.finditer(query_string)
        ]

        current_idx = 0
        # the first subquery has no preceding keyword; it is tagged "OR"
        last_modifier = "OR"
        term_regex = re.compile(fr"\s+({keywords})\s+")
        for term in term_regex.finditer(query_string):
            keyword_start = term.start(1)
            if any(start < keyword_start < end for start, end in quoted_spans):
                continue
            query_items = query_string[current_idx : term.start()].strip()
            self.queries.append(
                self._parse_subquery(query_items, last_modifier, query_tokenizer)
            )
            # the keyword applies to the subquery that FOLLOWS it
            last_modifier = term.group(1)
            current_idx = term.end()

        end_query = query_string[current_idx:].strip()
        self.queries.append(
            self._parse_subquery(end_query, last_modifier, query_tokenizer)
        )

    def _parse_subquery(
        self,
        query: str,
        last_modifier: str,
        query_tokenizer: Callable[[str], List[str]] = str.split,
    ) -> List[QueryItem]:
        """This parses queries between QUERY_TERM objects. Internal to init.

        The subquery is cut into alternating runs of unquoted text and
        quoted phrases. Each non-empty run is tokenized and becomes one
        :code:`QueryItem`; quoted phrases are flagged as exact.

        Args:
            query: The subquery
            last_modifier: Specifies the last query term that was used (or OR if none)
            query_tokenizer: Passed directly from __init__

        Returns:
            The query items found in the subquery, in order of appearance.
        """
        matches = []
        current_idx = 0
        for exact_match in self._QUOTE_REGEX.finditer(query):
            single_quote = exact_match.group("single")
            double_quote = exact_match.group("double")
            quoted_matl = single_quote if single_quote is not None else double_quote

            # loose words between the previous phrase and this one
            pre_match = query[current_idx : exact_match.start()].strip()
            if pre_match != "":
                matches.append(
                    QueryItem(query_tokenizer(pre_match), False, last_modifier)
                )
            matches.append(QueryItem(query_tokenizer(quoted_matl), True, last_modifier))
            current_idx = exact_match.end()

        # loose words after the last quoted phrase (or the whole subquery)
        post_match = query[current_idx:].strip()
        if post_match != "":
            matches.append(QueryItem(query_tokenizer(post_match), False, last_modifier))
        return matches

    def __repr__(self):
        return f"<Query ({self.queries})>"

    def __str__(self):
        return self.raw_query