Source code for textattack.transformations.word_swaps.word_swap_change_location

"""
Word Swap by Changing Location
-------------------------------
"""

from collections import defaultdict

import more_itertools as mit
import numpy as np

from textattack.shared.data import NAMED_ENTITIES

from .word_swap import WordSwap
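
# NAMED_ENTITIES serves as the "location map" referenced in the class docstring
# below: _get_new_location looks it up under the keys "country", "nationality",
# "city", and the "-spanish" / "-french" variants of the first two.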


def idx_to_words(ls, words):
    """Given a list generated from cluster_idx, return a list of sub-lists,
    each containing the cluster of indices as its first element and the
    corresponding words, joined into a single string, as its second."""

    output = []
    for sub_ls in ls:
        word = words[sub_ls[0]]
        for idx in sub_ls[1:]:
            word = " ".join([word, words[idx]])
        output.append([sub_ls, word])
    return output
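
# Example: with words = ["I", "love", "New", "York"] and clustered indices
# [[2, 3]], idx_to_words([[2, 3]], words) returns [[[2, 3], "New York"]].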

class WordSwapChangeLocation(WordSwap):
    def __init__(
        self, n=3, confidence_score=0.7, language="en", consistent=False, **kwargs
    ):
        """Transformation that changes recognized locations of a sentence to
        another location that is given in the location map.

        :param n: Number of new locations to generate
        :param confidence_score: A location is only changed if the NER prediction's score exceeds this value
        :param language: Language used to pick the NER model and replacement lists ("en", "fra"/"french", or "esp"/"spanish")
        :param consistent: Whether to change all instances of the same location to the same new location

        >>> from textattack.transformations import WordSwapChangeLocation
        >>> from textattack.augmentation import Augmenter

        >>> transformation = WordSwapChangeLocation()
        >>> augmenter = Augmenter(transformation=transformation)
        >>> s = 'I am in Dallas.'
        >>> augmenter.augment(s)
        """
        super().__init__(**kwargs)
        self.n = n
        self.confidence_score = confidence_score
        self.language = language
        self.consistent = consistent

    def _get_transformations(self, current_text, indices_to_modify):
        words = current_text.words
        location_idx = []
        if self.language == "en":
            model_name = "ner"
        elif self.language == "fra" or self.language == "french":
            model_name = "flair/ner-french"
        else:
            model_name = "flair/ner-multi-fast"
        for i in indices_to_modify:
            tag = current_text.ner_of_word_index(i, model_name)
            if "LOC" in tag.value and tag.score > self.confidence_score:
                location_idx.append(i)

        # Combine location idx and words to a list ([0] is idx, [1] is location name)
        # For example, [1, 2] becomes [[1, 2], "New York"]
        location_idx = [list(group) for group in mit.consecutive_groups(location_idx)]
        location_words = idx_to_words(location_idx, words)

        if self.consistent:
            location_to_indices = self._build_location_to_indicies_map(
                location_words, current_text
            )

        transformed_texts = []
        for location in location_words:
            idx = location[0]
            word = self._capitalize(location[1])

            # If doing consistent replacements, only replace the
            # word if it hasn't been replaced in a previous iteration
            if self.consistent and word not in location_to_indices:
                continue

            replacement_words = self._get_new_location(word)
            for r in replacement_words:
                if r == word:
                    continue

                if self.consistent:
                    # Replace every occurrence of this location with the same new
                    # location, blanking the trailing words of multi-word locations
                    indices_to_delete = []
                    if len(idx) > 1:
                        for i in location_to_indices[word]:
                            for j in range(1, len(idx)):
                                indices_to_delete.append(i + j)

                    transformed_texts.append(
                        current_text.replace_words_at_indices(
                            location_to_indices[word] + indices_to_delete,
                            ([r] * len(location_to_indices[word]))
                            + ([""] * len(indices_to_delete)),
                        )
                    )
                else:
                    # If the original location is more than a single word, keep only the starting word
                    # and replace the starting word with the new word
                    indices_to_delete = idx[1:]
                    transformed_texts.append(
                        current_text.replace_words_at_indices(
                            [idx[0]] + indices_to_delete,
                            [r] + [""] * len(indices_to_delete),
                        )
                    )

            if self.consistent:
                # Delete this word to mark it as replaced
                del location_to_indices[word]

        return transformed_texts

    def _get_new_location(self, word):
        """Return a list of new locations, with the choice of country,
        nationality, and city."""
        language = ""
        if self.language == "esp" or self.language == "spanish":
            language = "-spanish"
        elif self.language == "fra" or self.language == "french":
            language = "-french"
        if word in NAMED_ENTITIES["country" + language]:
            return np.random.choice(NAMED_ENTITIES["country" + language], self.n)
        elif word in NAMED_ENTITIES["nationality" + language]:
            return np.random.choice(NAMED_ENTITIES["nationality" + language], self.n)
        elif word in NAMED_ENTITIES["city"]:
            return np.random.choice(NAMED_ENTITIES["city"], self.n)
        return []

    def _capitalize(self, string):
        """Capitalizes all words in the string."""
        return " ".join(word.capitalize() for word in string.split())

    def _build_location_to_indicies_map(self, location_words, text):
        """Returns a map of each location and the starting indices of all
        appearances of that location in the text."""
        location_to_indices = defaultdict(list)
        if len(location_words) == 0:
            return location_to_indices

        # Sort by the number of words in the location
        location_words.sort(
            key=lambda index_location_pair: index_location_pair[0][-1]
            - index_location_pair[0][0]
            + 1,
            reverse=True,
        )
        max_length = location_words[0][0][-1] - location_words[0][0][0] + 1

        for idx, location in location_words:
            words_in_location = idx[-1] - idx[0] + 1
            found = False
            location_start = idx[0]

            # Check each window of n words containing the original tagged location
            # for n from the max_length down to the original location length.
            # This prevents cases where the NER tagger misses a word in a location
            # (e.g. it does not tag "New" in "New York")
            for length in range(max_length, words_in_location, -1):
                for start in range(
                    location_start - length + words_in_location,
                    location_start + 1,
                ):
                    if start + length > len(text.words):
                        break
                    expanded_location = self._capitalize(
                        " ".join(text.words[start : start + length])
                    )
                    if expanded_location in location_to_indices:
                        location_to_indices[expanded_location].append(start)
                        found = True
                        break
                if found:
                    break

            if not found:
                location_to_indices[self._capitalize(location)].append(idx[0])

        return location_to_indices