# Source code for textattack.constraints.grammaticality.language_models.learning_to_write.language_model_helpers

"""
Language model helpers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"""

import os

import numpy as np
import torch

from textattack.shared.utils import LazyLoader

from .rnn_model import RNNModel

# NOTE(review): presumably LazyLoader defers the real `torchfile` import until
# the name is first used (only `load_model` touches it) -- confirm against
# textattack.shared.utils.LazyLoader.
torchfile = LazyLoader("torchfile", globals(), "torchfile")


class QueryHandler:
    """Score tokenized sentences with a recurrent language model.

    Attributes are stored exactly as passed in:

    * ``model`` -- object exposing ``init_hidden(batch_size)`` and callable as
      ``model(source, hidden)``, returning ``(decode, hidden)``.
    * ``word_to_idx`` -- dict mapping a word to its raw vocabulary index; it
      must contain the start token ``"<S>"``.
    * ``mapto`` -- tensor indexed by raw vocabulary indices, yielding the
      indices that are fed to the model.
    * ``device`` -- device on which index tensors are created.
    """

    def __init__(self, model, word_to_idx, mapto, device):
        self.model = model
        self.word_to_idx = word_to_idx
        self.mapto = mapto
        self.device = device

    def query(self, sentences, swapped_words, batch_size=32):
        """Score ``sentences``, falling back to one-at-a-time prediction.

        Since we don't filter prefixes for OOV ahead of time, it's possible
        that some of them will have different lengths. When this is the case,
        we can't do RNN prediction in batch.

        This method _tries_ to do prediction in batch, and, when it
        fails, just does prediction sequentially and concatenates all of
        the results.

        Returns a list with one score per sentence. In the sequential
        fallback, a sentence whose prediction raises ``RuntimeError`` is
        reported on stdout and scored ``float("-inf")``; any other exception
        raised there propagates to the caller.
        """
        try:
            return self.try_query(sentences, swapped_words, batch_size=batch_size)
        except Exception:
            # Deliberate best-effort: any failure of the batched attempt
            # (e.g. ragged index rows that cannot form a tensor) triggers the
            # per-sentence path below.
            probs = []
            for s, w in zip(sentences, swapped_words):
                try:
                    probs.append(self.try_query([s], [w], batch_size=1)[0])
                except RuntimeError:
                    print(
                        "WARNING:  got runtime error trying languag emodel on language model w s/w",
                        s,
                        w,
                    )
                    probs.append(float("-inf"))
            return probs

    def try_query(self, sentences, swapped_words, batch_size=32):
        """Score equal-length ``sentences`` in batches of ``batch_size``.

        Each sentence is a sequence of words. Words missing from
        ``word_to_idx`` are dropped and ``"<S>"`` is prepended before the
        sentence is fed to the model. The score of a sentence is the sum, over
        time steps, of the model output at the index of the next word
        (presumably a log-probability -- confirm against the model).

        Returns a list with exactly one entry per sentence, in input order.
        An entry is ``float("-inf")`` when the matching element of
        ``swapped_words`` is not in ``word_to_idx``, or when its batch has
        nothing to predict (no in-vocabulary words, or an empty hidden state).

        Raises ``ValueError`` if the sentences do not all have the same
        length. Errors from tensor construction or the model propagate.
        """
        # TODO use caching
        if len(sentences) == 0:
            # Nothing to score; also avoids indexing sentences[0] below.
            return []

        sentence_length = len(sentences[0])
        if any(len(s) != sentence_length for s in sentences):
            raise ValueError("Only same length batches are allowed")

        log_probs = []
        for start in range(0, len(sentences), batch_size):
            stop = min(len(sentences), start + batch_size)
            swapped_words_batch = swapped_words[start:stop]
            batch = sentences[start:stop]

            # raw_idx_list[t] collects the raw index of the t-th token
            # (position 0 is "<S>") of every sentence that is long enough
            # after OOV filtering.
            raw_idx_list = [[] for _ in range(sentence_length + 1)]
            for s in batch:
                words = ["<S>"] + [word for word in s if word in self.word_to_idx]
                for t, raw_idx in enumerate(self.word_to_idx[w] for w in words):
                    raw_idx_list[t].append(raw_idx)

            # Drop time steps no sentence reached. If the surviving rows have
            # different lengths, torch.tensor raises and `query` falls back to
            # sequential prediction.
            orig_num_idxs = len(raw_idx_list)
            raw_idx_list = [x for x in raw_idx_list if len(x)]
            num_idxs_dropped = orig_num_idxs - len(raw_idx_list)
            all_raw_idxs = torch.tensor(
                raw_idx_list, device=self.device, dtype=torch.long
            )
            word_idxs = self.mapto[all_raw_idxs]
            hidden = self.model.init_hidden(len(batch))
            source = word_idxs[:-1, :]
            target = word_idxs[1:, :]
            if (not len(source)) or not len(hidden):
                # Nothing to predict for this batch. Record -inf for each of
                # its sentences and keep going, so scores of the other
                # batches are preserved and the result stays aligned with
                # `sentences`.
                log_probs.extend([float("-inf")] * len(batch))
                continue

            decode, hidden = self.model(source, hidden)
            num_steps = sentence_length - num_idxs_dropped  # == len(source)
            decode = decode.view(num_steps, len(batch), -1)
            for i in range(len(batch)):
                if swapped_words_batch[i] not in self.word_to_idx:
                    log_probs.append(float("-inf"))
                else:
                    log_probs.append(
                        sum(
                            decode[t, i, target[t, i]].item()
                            for t in range(num_steps)
                        )
                    )
        return log_probs

    @staticmethod
    def load_model(lm_folder_path, device):
        """Build a ``QueryHandler`` from the files in ``lm_folder_path``.

        Reads ``word_map.th7`` and ``word_freq.th7`` with ``torchfile`` and
        the model weights from ``lm-state-dict.pt``, moves the model to
        ``device`` and puts it in eval mode.
        """
        word_map = torchfile.load(os.path.join(lm_folder_path, "word_map.th7"))
        word_map = [w.decode("utf-8") for w in word_map]
        word_to_idx = {w: i for i, w in enumerate(word_map)}
        word_freq = torchfile.load(os.path.join(lm_folder_path, "word_freq.th7"))
        # Inverse of the descending-frequency ordering: maps a raw word index
        # to its position in that ordering.
        mapto = torch.from_numpy(util_reverse(np.argsort(-word_freq))).long().to(device)

        # The context manager closes the file even if loading fails.
        state_dict_path = os.path.join(lm_folder_path, "lm-state-dict.pt")
        with open(state_dict_path, "rb") as model_file:
            state_dict = torch.load(model_file, map_location=device)

        # NOTE(review): positional arguments are presumably model type,
        # vocabulary size, embedding size, hidden size, number of layers and
        # adaptive-softmax cutoffs -- confirm against RNNModel.
        model = RNNModel(
            "GRU",
            793471,
            256,
            2048,
            1,
            [4200, 35000, 180000, 793471],
            dropout=0.01,
            proj=True,
            lm1b=True,
        )

        model.load_state_dict(state_dict)
        model.full = True  # Use real softmax--important!
        model.to(device)
        model.eval()
        return QueryHandler(model, word_to_idx, mapto, device)


def util_reverse(item):
    """Return the inverse mapping of the index sequence ``item``.

    For every position ``idx`` holding value ``val`` in ``item``, the result
    has ``idx`` stored at position ``val``. ``item`` is expected to be a
    permutation of ``range(len(item))`` (e.g. the output of ``np.argsort``);
    behaviour with repeated values is not relied upon.

    The result is a float64 array, the default dtype of ``np.zeros``.
    """
    size = len(item)
    new_item = np.zeros(size)
    if size:
        # One vectorized scatter instead of a Python-level loop; this matters
        # for vocabulary-sized inputs.
        new_item[np.asarray(item)] = np.arange(size)
    return new_item