# Source code for textattack.attack_recipes.bad_characters_2021

"""
Imperceptible Perturbations Algorithm
======================================

"""

from textattack import Attack
from textattack.goal_functions import (
    LogitSum,
    MaximizeLevenshtein,
    MinimizeBleu,
    NamedEntityRecognition,
    TargetedBonus,
    TargetedClassification,
    TargetedStrict,
)
from textattack.search_methods import DifferentialEvolution
from textattack.transformations import (
    WordSwapDeletions,
    WordSwapHomoglyphSwap,
    WordSwapInvisibleCharacters,
    WordSwapReorderings,
)

from .attack_recipe import AttackRecipe


class BadCharacters2021(AttackRecipe):
    """Imperceptible Perturbations Attack Recipe
    =========================================

    Implements imperceptible adversarial attacks on NLP models as outlined in the
    `Bad Characters paper <https://arxiv.org/abs/2106.09898>`_.

    This recipe combines imperceptible transformations with the Differential Evolution
    search method. It supports a variety of goal functions (targeted, untargeted,
    NER, translation) and several types of character-level perturbations.

    **Transformations supported:**

    - ``WordSwapInvisibleCharacters``: injects invisible Unicode characters
    - ``WordSwapHomoglyphSwap``: replaces characters with homoglyphs
    - ``WordSwapDeletions``: inserts deletion control characters
    - ``WordSwapReorderings``: inserts reordering control characters

    **Goal functions supported:**

    - ``TargetedClassification``
    - ``TargetedStrict``
    - ``TargetedBonus``
    - ``NamedEntityRecognition`` (token-level NER attack)
    - ``LogitSum`` (for logits-based classifiers like toxic comment detection)
    - ``MinimizeBleu`` (translation BLEU score minimization)
    - ``MaximizeLevenshtein`` (translation Levenshtein distance maximization)

    All transformations are compatible with all goal functions.

    Note:
    This recipe assumes the model wrapper is compatible with the goal function chosen.
    For example, a ``NamedEntityRecognition`` goal function expects a model wrapper
    that outputs a list of dictionaries per input, while ``LogitSum`` expects an array of logits.
    """

    @staticmethod
    def build(
        model_wrapper,
        goal_function_type: str = "targeted_classification",
        perturbation_type: str = None,
        allow_skip: bool = False,
        perturbs=1,
        popsize=32,
        maxiter=10,
        **goal_function_kwargs
    ):
        """Builds an imperceptible attack instance.

        The goal function is constructed first, then the transformation, then the
        ``DifferentialEvolution`` search method. No constraints are applied.

        Parameters
        ----------
        model_wrapper : ModelWrapper
            A TextAttack model wrapper compatible with the selected goal function.
        goal_function_type : str, optional
            Goal function type. One of:

            - ``"targeted_classification"``: targeted attack on a classification model (default).
              Keyword arguments: ``target_class: int = 0``.
            - ``"targeted_strict"``: stricter targeted attack. Pass in a model wrapper
              that returns an array of probabilities.
              Keyword arguments: ``target_class: int = 0``.
            - ``"targeted_bonus"``: bonus if prediction for target class is highest.
              Pass in a model wrapper that returns an array of probabilities.
              Keyword arguments: ``target_class: int = 0``.
            - ``"named_entity_recognition"``: token-level NER attack. Pass in a model
              wrapper that returns a list of dictionaries each containing
              ``'entity'`` and ``'score'`` keys.
              Keyword arguments: ``target_suffix: str`` (no default value; must specify).
            - ``"logit_sum"``: untargeted attack minimizing total logits. Pass in a
              model wrapper that returns an array of logits.
              Keyword arguments: ``target_logit_sum=None``, ``first_element_threshold=None``.
              Error if both are specified. If neither is specified,
              ``first_element_threshold`` is set to 0.5.
            - ``"minimize_bleu"``: translation attack minimizing BLEU. Pass in a model
              wrapper that returns a string.
              Keyword arguments: ``target_bleu: float = 0.0``.
            - ``"maximize_levenshtein"``: translation attack maximizing Levenshtein
              distance. Pass in a model wrapper that returns a string.
              Keyword arguments: ``target_distance: float = None``.
        perturbation_type : str, optional
            Type of character-level perturbation. ``None`` selects the default. One of:

            - ``"homoglyphs"`` (default)
            - ``"invisible"``
            - ``"deletions"``
            - ``"reorderings"``
        allow_skip : bool
            If False, the attack will continue even if the goal is already satisfied.
            Forwarded unchanged to the goal function constructor.
        perturbs : int
            Maximum number of perturbations allowed per input string.
        popsize : int
            Population size for differential evolution. Typically 32.
        maxiter : int
            Maximum number of generations for differential evolution. Typically 10.
        **goal_function_kwargs : dict
            Additional arguments passed to the goal function.

        Returns
        -------
        textattack.Attack
            Configured Attack instance.

        Raises
        ------
        ValueError
            If ``goal_function_type`` or ``perturbation_type`` is not one of the
            values listed above.
        """
        # Lookup tables hold the classes (not instances) so that only the
        # selected goal function / transformation is ever constructed.
        goal_function_classes = {
            "targeted_classification": TargetedClassification,
            "targeted_strict": TargetedStrict,
            "targeted_bonus": TargetedBonus,
            "named_entity_recognition": NamedEntityRecognition,
            "logit_sum": LogitSum,
            "minimize_bleu": MinimizeBleu,
            "maximize_levenshtein": MaximizeLevenshtein,
        }
        transformation_classes = {
            "homoglyphs": WordSwapHomoglyphSwap,
            "invisible": WordSwapInvisibleCharacters,
            "deletions": WordSwapDeletions,
            "reorderings": WordSwapReorderings,
        }

        try:
            goal_function_cls = goal_function_classes[goal_function_type]
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which the previous equality
            # checks also rejected with ValueError.
            raise ValueError(
                "Invalid goal_function_type! Got {!r}; expected one of {}.".format(
                    goal_function_type, sorted(goal_function_classes)
                )
            ) from None
        goal_function = goal_function_cls(
            model_wrapper, allow_skip=allow_skip, **goal_function_kwargs
        )

        if perturbation_type is None:
            # Default to homoglyphs
            perturbation_type = "homoglyphs"
        try:
            transformation_cls = transformation_classes[perturbation_type]
        except (KeyError, TypeError):
            raise ValueError(
                "Invalid perturbation_type! Got {!r}; expected one of {}.".format(
                    perturbation_type, sorted(transformation_classes)
                )
            ) from None
        transformation = transformation_cls()

        search_method = DifferentialEvolution(
            popsize=popsize, maxiter=maxiter, verbose=False, max_perturbs=perturbs
        )

        # This recipe applies no constraints.
        constraints = []

        return Attack(goal_function, constraints, transformation, search_method)