Source code for textattack.attack_recipes.bad_characters_2021

"""
Imperceptible Perturbations Algorithm
======================================

"""

from textattack import Attack
from textattack.goal_functions import (
    LogitSum,
    MaximizeLevenshtein,
    MinimizeBleu,
    NamedEntityRecognition,
    TargetedBonus,
    TargetedClassification,
    TargetedStrict,
)
from textattack.search_methods import DifferentialEvolution
from textattack.transformations import (
    WordSwapDeletions,
    WordSwapHomoglyphSwap,
    WordSwapInvisibleCharacters,
    WordSwapReorderings,
)

from .attack_recipe import AttackRecipe


[docs]class BadCharacters2021(AttackRecipe): """Imperceptible Perturbations Attack Recipe ========================================= Implements imperceptible adversarial attacks on NLP models as outlined in the `Bad Characters paper <https://arxiv.org/abs/2106.09898>`_. This recipe combines imperceptible transformations with the Differential Evolution search method. It supports a variety of goal functions (targeted, untargeted, NER, translation) and several types of character-level perturbations. **Transformations supported:** - ``WordSwapInvisibleCharacters``: injects invisible Unicode characters - ``WordSwapHomoglyphSwap``: replaces characters with homoglyphs - ``WordSwapDeletions``: inserts deletion control characters - ``WordSwapReorderings``: inserts reordering control characters **Goal functions supported:** - ``TargetedClassification`` - ``TargetedStrict`` - ``TargetedBonus`` - ``LogitSum`` (for logits-based classifiers like toxic comment detection) - ``MinimizeBleu`` (translation BLEU score minimization) - ``MaximizeLevenshtein`` (translation Levenshtein distance maximization) All transformations are compatible with all goal functions. Note: This recipe assumes the model wrapper is compatible with the goal function chosen. For example, a ``NamedEntityRecognition`` goal function expects a model wrapper that outputs a list of dictionaries per input, while ``LogitSum`` expects an array of logits. """
[docs] @staticmethod def build( model_wrapper, goal_function_type: str, perturbation_type: str = None, allow_skip: bool = False, perturbs=1, popsize=32, maxiter=10, **goal_function_kwargs ): """Builds an imperceptible attack instance. Parameters ---------- model_wrapper : ModelWrapper A TextAttack model wrapper compatible with the selected goal function. goal_function_type : str, optional Goal function type. One of: - ``"targeted_classification"``: targeted attack on a classification model (default). - ``"targeted_strict"``: stricter targeted attack. - ``"targeted_bonus"``: bonus if prediction for target class is highest. - ``"named_entity_recognition"``: token-level NER attack. - ``"logit_sum"``: untargeted attack minimizing total logits. - ``"minimize_bleu"``: translation attack minimizing BLEU. - ``"maximize_levenshtein"``: translation attack maximizing Levenshtein distance. perturbation_type : str, optional Type of character-level perturbation. One of: - ``"homoglyphs"`` (default) - ``"invisible"`` - ``"deletions"`` - ``"reorderings"`` allow_skip : bool If False, the attack will continue even if the goal is already satisfied. perturbs : int Maximum number of perturbations allowed per input string. popsize : int Population size for differential evolution. Typically 32. maxiter : int Maximum number of generations for differential evolution. Typically 10. **goal_function_kwargs : dict Additional arguments passed to the goal function. Returns ------- textattack.Attack Configured Attack instance. """ if goal_function_type == "targeted_classification": """Defaults to TargetedClassification. **goal_function_kwargs: - target_class: int = 0 """ goal_function = TargetedClassification( model_wrapper, allow_skip=allow_skip, **goal_function_kwargs ) elif goal_function_type == "targeted_strict": """Pass in a model wrapper that returns an array of probabilities. **goal_function_kwargs: - target_class: int = 0 """ goal_function = TargetedStrict( model_wrapper, allow_skip=allow_skip, **goal_function_kwargs ) elif goal_function_type == "targeted_bonus": """Pass in a model wrapper that returns an array of probabilities. **goal_function_kwargs: - target_class: int = 0 """ goal_function = TargetedBonus( model_wrapper, allow_skip=allow_skip, **goal_function_kwargs ) elif goal_function_type == "named_entity_recognition": """Pass in a model wrapper that returns a list of dictionaries each containing 'entity' and 'score' keys. **goal_function_kwargs: - target_suffix: str (no default value; must specify) """ goal_function = NamedEntityRecognition( model_wrapper, allow_skip=allow_skip, **goal_function_kwargs ) elif goal_function_type == "logit_sum": """Pass in a model wrapper that returns an array of logits. **goal_function_kwargs: - target_logit_sum=None - first_element_threshold=None Error if both are specified. If neither is specified, first_element_threshold is set to 0.5. """ goal_function = LogitSum( model_wrapper, allow_skip=allow_skip, **goal_function_kwargs ) elif goal_function_type == "minimize_bleu": """Pass in a model wrapper that returns a string. **goal_function_kwargs: - target_bleu: float=0.0 """ goal_function = MinimizeBleu( model_wrapper, allow_skip=allow_skip, **goal_function_kwargs ) elif goal_function_type == "maximize_levenshtein": """Pass in a model wrapper that returns a string. **goal_function_kwargs: - target_distance: float=None """ goal_function = MaximizeLevenshtein( model_wrapper, allow_skip=allow_skip, **goal_function_kwargs ) else: raise ValueError("Invalid goal_function_type!") if perturbation_type is None: # Default to homoglyphs transformation = WordSwapHomoglyphSwap() elif perturbation_type == "homoglyphs": transformation = WordSwapHomoglyphSwap() elif perturbation_type == "invisible": transformation = WordSwapInvisibleCharacters() elif perturbation_type == "deletions": transformation = WordSwapDeletions() elif perturbation_type == "reorderings": transformation = WordSwapReorderings() else: raise ValueError("Invalid perturbation_type!") search_method = DifferentialEvolution( popsize=popsize, maxiter=maxiter, verbose=False, max_perturbs=perturbs ) constraints = [] return Attack(goal_function, constraints, transformation, search_method)