Python string matching with n-grams
# https://pythonprogrammingsnippets.com


def get_ngrams(text, n):
    """Split ``text`` into its overlapping n-grams.

    Args:
        text: The string (or other sliceable sequence) to split.
        n: The n-gram size; must be at least 1.

    Returns:
        A list with every length-``n`` slice of ``text`` in order of
        appearance, duplicates included. The list is empty when ``text``
        is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1, because such a size does
            not describe a meaningful n-gram.
    """
    if n < 1:
        raise ValueError(f"n-gram size must be at least 1, got {n}")
    return [text[i:i + n] for i in range(len(text) - n + 1)]


def compare_strings_ngram_pct(string1, string2, n):
    """Return the percentage of ``string1``'s n-grams that occur in ``string2``.

    Every n-gram occurrence in ``string1`` is counted, so repeated n-grams
    are weighted consistently in the numerator and the denominator and two
    identical strings always score 100.

    Args:
        string1: The string whose n-grams are looked up.
        string2: The string the n-grams are looked up in.
        n: The n-gram size; must be at least 1.

    Returns:
        A float between 0.0 and 100.0. Returns 0.0 when ``string1`` is
        shorter than ``n`` and therefore has no n-grams to compare.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    string1_ngrams = get_ngrams(string1, n)
    if not string1_ngrams:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    # Build the lookup set once so each membership test is O(1).
    string2_ngrams = set(get_ngrams(string2, n))
    matching_count = sum(1 for gram in string1_ngrams if gram in string2_ngrams)

    return (matching_count / len(string1_ngrams)) * 100


def compare_strings_ngram_max_size(string1, string2):
    """Return the size of the largest n-gram shared by both strings.

    Sizes are tried from the longest possible (the length of the shorter
    string) down to 1, so the first size with a shared n-gram is the
    maximum. This equals the length of the longest common substring.

    Args:
        string1: The first string.
        string2: The second string.

    Returns:
        The maximum shared n-gram size, or 0 when the strings share no
        n-gram at all (including when either string is empty).
    """
    longest_possible = min(len(string1), len(string2))
    for size in range(longest_possible, 0, -1):
        string1_ngrams = set(get_ngrams(string1, size))
        # isdisjoint stops at the first shared n-gram it encounters.
        if not string1_ngrams.isdisjoint(get_ngrams(string2, size)):
            return size

    # If no matching n-grams are found, return 0
    return 0


string1 = "hello world"
string2 = "hello there"
n = 2  # n-gram size

# find how much of string1 is made of n-grams that also occur in string2
percentage_match = compare_strings_ngram_pct(string1, string2, n)
print(f"The percentage match is: {percentage_match}%")

# find maximum ngram size of matching ngrams
max_match_size = compare_strings_ngram_max_size(string1, string2)
print(f"The maximum matching n-gram size is: {max_match_size}")












