# Source code for textattack.constraints.grammaticality.language_models.google_language_model.alzantot_goog_lm

"""

Google Language Models from Alzantot
--------------------------------------

    Author: Moustafa Alzantot (malzantot@ucla.edu)
    All rights reserved.
"""


import os

import lru
import numpy as np

from textattack.shared import utils

from . import lm_data_utils, lm_utils

tf = utils.LazyLoader("tensorflow", globals(), "tensorflow")


# @TODO automatically choose between GPU and CPU.


class GoogLMHelper:
    """An implementation of `<https://arxiv.org/abs/1804.07998>`_ adapted from
    `<https://github.com/nesl/nlp_adversarial_examples>`_.

    Wraps the pretrained "one billion word" Google language model and exposes
    word-probability queries (see ``get_words_probs``) with an LRU cache.
    """

    # Relative S3 path where the pretrained Google LM artifacts live.
    CACHE_PATH = "constraints/semantics/language-models/alzantot-goog-lm"

    def __init__(self):
        """Download the pretrained model files and build the TF graph/session."""
        tf.get_logger().setLevel("INFO")
        # Fetch (or reuse a locally cached copy of) the pretrained model.
        lm_folder = utils.download_from_s3(GoogLMHelper.CACHE_PATH)
        self.PBTXT_PATH = os.path.join(lm_folder, "graph-2016-09-10-gpu.pbtxt")
        self.CKPT_PATH = os.path.join(lm_folder, "ckpt-*")
        self.VOCAB_PATH = os.path.join(lm_folder, "vocab-2016-09-10.txt")

        # Inference is performed one token at a time for a single sequence.
        self.BATCH_SIZE = 1
        self.NUM_TIMESTEPS = 1
        self.MAX_WORD_LEN = 50

        self.vocab = lm_data_utils.CharsVocabulary(self.VOCAB_PATH, self.MAX_WORD_LEN)
        # NOTE(review): device is hard-coded to GPU 1 — see the module-level
        # TODO about choosing between GPU and CPU automatically.
        with tf.device("/gpu:1"):
            self.graph = tf.Graph()
            self.sess = tf.compat.v1.Session(graph=self.graph)
        with self.graph.as_default():
            self.t = lm_utils.LoadModel(
                self.sess, self.graph, self.PBTXT_PATH, self.CKPT_PATH
            )
        # Bounded cache of (prefix, word) -> probability, 2**18 entries.
        self.lm_cache = lru.LRU(2**18)
[docs] def clear_cache(self): self.lm_cache.clear()
[docs] def get_words_probs_uncached(self, prefix_words, list_words): targets = np.zeros([self.BATCH_SIZE, self.NUM_TIMESTEPS], np.int32) weights = np.ones([self.BATCH_SIZE, self.NUM_TIMESTEPS], np.float32) if prefix_words.find("<S>") != 0: prefix_words = "<S> " + prefix_words prefix = [self.vocab.word_to_id(w) for w in prefix_words.split()] prefix_char_ids = [self.vocab.word_to_char_ids(w) for w in prefix_words.split()] inputs = np.zeros([self.BATCH_SIZE, self.NUM_TIMESTEPS], np.int32) char_ids_inputs = np.zeros( [self.BATCH_SIZE, self.NUM_TIMESTEPS, self.vocab.max_word_length], np.int32 ) samples = prefix[:] char_ids_samples = prefix_char_ids[:] inputs = [[samples[-1]]] char_ids_inputs[0, 0, :] = char_ids_samples[-1] softmax = self.sess.run( self.t["softmax_out"], feed_dict={ self.t["char_inputs_in"]: char_ids_inputs, self.t["inputs_in"]: inputs, self.t["targets_in"]: targets, self.t["target_weights_in"]: weights, }, ) words_ids = [self.vocab.word_to_id(w) for w in list_words] word_probs = [softmax[0][w_id] for w_id in words_ids] return np.array(word_probs)
[docs] def get_words_probs(self, prefix, list_words): """Retrieves the probability of words. Args: prefix_words list_words """ uncached_words = [] for word in list_words: if (prefix, word) not in self.lm_cache: if word not in uncached_words: uncached_words.append(word) probs = self.get_words_probs_uncached(prefix, uncached_words) for word, prob in zip(uncached_words, probs): self.lm_cache[prefix, word] = prob return [self.lm_cache[prefix, word] for word in list_words]
def __getstate__(self): state = self.__dict__.copy() state["lm_cache"] = self.lm_cache.get_size() return state def __setstate__(self, state): self.__dict__ = state self.lm_cache = lru.LRU(state["lm_cache"])