Source code for textattack.attack_recipes.faster_genetic_algorithm_jia_2019

"""

Faster Alzantot Genetic Algorithm
===================================
(Certified Robustness to Adversarial Word Substitutions)


"""

from textattack import Attack
from textattack.constraints.grammaticality.language_models import (
    LearningToWriteLanguageModel,
)
from textattack.constraints.overlap import MaxWordsPerturbed
from textattack.constraints.pre_transformation import (
    RepeatModification,
    StopwordModification,
)
from textattack.constraints.semantics import WordEmbeddingDistance
from textattack.goal_functions import UntargetedClassification
from textattack.search_methods import AlzantotGeneticAlgorithm
from textattack.transformations import WordSwapEmbedding

from .attack_recipe import AttackRecipe


class FasterGeneticAlgorithmJia2019(AttackRecipe):
    """Certified Robustness to Adversarial Word Substitutions.

    Robin Jia, Aditi Raghunathan, Kerem Göksel, Percy Liang (2019).

    https://arxiv.org/pdf/1909.00986.pdf
    """
    @staticmethod
    def build(model_wrapper):
        #
        # Section 5: Experiments
        #
        # We base our sets of allowed word substitutions S(x, i) on the
        # substitutions allowed by Alzantot et al. (2018). They demonstrated that
        # their substitutions lead to adversarial examples that are qualitatively
        # similar to the original input and retain the original label, as judged
        # by humans. Alzantot et al. (2018) define the neighbors N(w) of a word w
        # as the n = 8 nearest neighbors of w in a "counter-fitted" word vector
        # space where antonyms are far apart (Mrkšić et al., 2016). The
        # neighbors must also lie within some Euclidean distance threshold. They
        # also use a language model constraint to avoid nonsensical perturbations:
        # they allow substituting x_i with x̃_i ∈ N(x_i) if and only if it does
        # not decrease the log-likelihood of the text under a pre-trained
        # language model by more than some threshold. (Note that the model
        # itself classifies using a different set of pre-trained word vectors;
        # the counter-fitted vectors are only used to define the set of allowed
        # substitution words.)
        #
        # We make three modifications to this approach:
        #
        # First, in Alzantot et al. (2018), the adversary applies substitutions
        # one at a time, and the neighborhoods and language model scores are
        # computed relative to the current altered version of the input. This
        # results in a hard-to-define attack surface, as changing one word can
        # allow or disallow changes to other words. It also requires recomputing
        # language model scores at each iteration of the genetic attack, which
        # is inefficient. Moreover, the same word can be substituted multiple
        # times, leading to semantic drift. We define allowed substitutions
        # relative to the original sentence x, and disallow repeated
        # substitutions.
        #
        # Second, we use a faster language model that allows us to query longer
        # contexts; Alzantot et al. (2018) use a slower language model and could
        # only query it with short contexts.
        #
        # Finally, we use the language model constraint only at test time; the
        # model is trained against all perturbations in N(w). This encourages
        # the model to be robust to a larger space of perturbations, instead of
        # specializing for the particular choice of language model. See Appendix
        # A.3 for further details. [This is a model-specific adjustment, so it
        # does not affect the attack recipe.]
        #
        # Appendix A.3:
        #
        # In Alzantot et al. (2018), the adversary applies replacements one at a
        # time, and the neighborhoods and language model scores are computed
        # relative to the current altered version of the input. This results in
        # a hard-to-define attack surface, as the same word can be replaced many
        # times, leading to semantic drift. We instead pre-compute the allowed
        # substitutions S(x, i) at index i based on the original x. We define
        # S(x, i) as the set of x̃_i ∈ N(x_i) such that
        #
        #   log P(x_{i-W}, ..., x_{i-1}, x̃_i, x_{i+1}, ..., x_{i+W})
        #       >= log P(x_{i-W}, ..., x_{i+W}) - δ,
        #
        # where probabilities are assigned by a pre-trained language model, and
        # the window radius W and threshold δ are hyperparameters. We use W = 6
        # and δ = 5.
        #
        #
        # Swap words with their embedding nearest-neighbors.
        #
        # Embedding: Counter-fitted Paragram Embeddings.
        #
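        # Rough mapping from the hyperparameters discussed above and quoted
        # below onto this recipe's components (a reading aid, not a quote from
        # either paper):
        #   N = 8        -> WordSwapEmbedding(max_candidates=8)
        #   δ = 0.5      -> WordEmbeddingDistance(max_mse_dist=0.5)
        #   W = 6, δ = 5 -> LearningToWriteLanguageModel(window_size=6,
        #                   max_log_prob_diff=5.0)
        #   S = 60       -> AlzantotGeneticAlgorithm(pop_size=60)
        #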
# # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5" # transformation = WordSwapEmbedding(max_candidates=8) # # Don't modify the same word twice or stopwords # constraints = [RepeatModification(), StopwordModification()] # # Maximum words perturbed percentage of 20% # constraints.append(MaxWordsPerturbed(max_percent=0.2)) # # Maximum word embedding euclidean distance of 0.5. # constraints.append(WordEmbeddingDistance(max_mse_dist=0.5)) # # Language Model # # # constraints.append( LearningToWriteLanguageModel( window_size=6, max_log_prob_diff=5.0, compare_against_original=True ) ) # constraints.append(LearningToWriteLanguageModel(window_size=5)) # # Goal is untargeted classification # goal_function = UntargetedClassification(model_wrapper) # # Perform word substitution with a genetic algorithm. # search_method = AlzantotGeneticAlgorithm( pop_size=60, max_iters=40, post_crossover_check=False ) return Attack(goal_function, constraints, transformation, search_method)