https://gricad-gitlab.univ-grenoble-alpes.fr/coavouxm/flaubertagger.git
Tip revision: c939ca9fac094ac3c379256ef3d3d4d14a5a4bf1 (23 February 2024)
sentence_encoders.py
import torch
import torch.nn as nn
import torch.nn.init
import torch.nn.functional as F

from word_encoders import Words2Tensors
from word_encoders import WordEmbedder

class SentenceLSTM(nn.Module):
    """Two-layer bidirectional LSTM over batches of word-level embedding sequences."""

    def __init__(self, args, d_input):
        super(SentenceLSTM, self).__init__()
        self.args = args
        self.word_transducer_l1 = nn.LSTM(input_size=d_input,
                                          hidden_size=args.lstm_dim // 2,
                                          num_layers=1,
                                          batch_first=True,
                                          bidirectional=True)
        self.word_transducer_l2 = nn.LSTM(input_size=args.lstm_dim,
                                          hidden_size=args.lstm_dim // 2,
                                          num_layers=1,
                                          batch_first=True,
                                          bidirectional=True)
        # self.residual = args.lstm_residual_connection
        # self.ln = [int(i) for i in args.lstm_layer_norm]
        # self.layer_norm_l0 = nn.LayerNorm(d_input)
        # self.layer_norm_l1 = nn.LayerNorm(args.lstm_dim)
        # self.layer_norm_l2 = nn.LayerNorm(args.lstm_dim)

    def forward(self, all_embeddings, lengths, order=None):
        # if self.ln[0]:
        #     all_embeddings = [self.layer_norm_l0(e) for e in all_embeddings]
        # Pad the variable-length sequences, then pack them so the LSTMs skip
        # padded positions (sequences are assumed sorted by decreasing length).
        padded_embeddings = torch.nn.utils.rnn.pad_sequence(all_embeddings, batch_first=True)
        packed_padded_char_based_embeddings = torch.nn.utils.rnn.pack_padded_sequence(
            padded_embeddings, lengths, batch_first=True)
        output_l1, (h_n, c_n) = self.word_transducer_l1(packed_padded_char_based_embeddings)
        output_l2, (h_n, c_n) = self.word_transducer_l2(output_l1)
        # unpacked_l1, _ = torch.nn.utils.rnn.pad_packed_sequence(output_l1, batch_first=True)
        unpacked_l2, _ = torch.nn.utils.rnn.pad_packed_sequence(output_l2, batch_first=True)
        # unpacked_l1 = [t.squeeze(0) for t in unpacked_l1.split([1 for l in lengths], dim=0)]
        # unpacked_l1 = [t[1:l-1, :] for t, l in zip(unpacked_l1, lengths)]
        # if self.ln[1]:
        #     unpacked_l1 = [self.layer_norm_l1(l1) for l1 in unpacked_l1]
        # Split the padded batch back into one tensor per sentence and trim the padding.
        unpacked_l2 = [t.squeeze(0) for t in unpacked_l2.split([1 for l in lengths], dim=0)]
        unpacked_l2 = [t[:l, :] for t, l in zip(unpacked_l2, lengths)]
        # if self.ln[2]:
        #     unpacked_l2 = [self.layer_norm_l2(l2) for l2 in unpacked_l2]
        return unpacked_l2
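
# Standalone usage sketch for SentenceLSTM (commented out; the dimensions below
# mirror the toy example in __main__ and are illustrative assumptions, not
# canonical values used by the tagger):
# args = argparse.Namespace(lstm_dim=30)
# encoder = SentenceLSTM(args, d_input=27)            # 27 = char-based (12) + word (15)
# # Two sentences of 5 and 3 tokens, already sorted by decreasing length.
# embeddings = [torch.randn(5, 27), torch.randn(3, 27)]
# outputs = encoder(embeddings, lengths=[5, 3])
# # Each output has shape (sentence_length, args.lstm_dim): (5, 30) and (3, 30).
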
class HierarchicalLSTM(nn.Module):
    """Word embedder (word- and character-level) followed by a sentence bi-LSTM."""
    # args: Words2Tensors (self, char2i, word2i, pchar, pword)
    # args: WordEmbedder (args, num_words, num_chars, word2tensors)
    # args: CharacterLstmLayer (emb_dim, voc_size, out_dim, words2tensors=None, dropout=0.2, embed_init=0.1)

    def __init__(self, args, char2i, word2i):
        super(HierarchicalLSTM, self).__init__()
        words2tensors = Words2Tensors(char2i, word2i, args.lstm_drop_char, args.lstm_drop_word)
        self.embedder = WordEmbedder(args, len(word2i), len(char2i), words2tensors)
        # Input dimension of the sentence LSTM: character-based embeddings,
        # word embeddings, or the concatenation of both.
        d_input = 0
        if not args.no_char:
            d_input += args.C
        if not args.no_word:
            d_input += args.w
        self.lstm = SentenceLSTM(args, d_input)
        self.dim = args.lstm_dim

    def forward(self, sentences, batch=True):
        # Sort sentences by decreasing length, as required by pack_padded_sequence.
        indices, sorted_sentences = zip(*sorted(zip(range(len(sentences)), sentences),
                                                key=lambda x: len(x[1]),
                                                reverse=True))
        embeddings, lengths = self.embedder(sorted_sentences)
        output_encoder = self.lstm(embeddings, lengths)
        # Restore the original sentence order.
        _, encoded_sentences = zip(*sorted(zip(indices, output_encoder), key=lambda x: x[0]))
        return encoded_sentences

    @staticmethod
    def add_cmd_options(cmd):
        cmd.add_argument("--lstm-dim", "-W", metavar="dim", type=int, default=400, help="Dimension of sentence bi-LSTM")
        cmd.add_argument("--lstm-dim-word", "-w", dest="w", metavar="dim", type=int, default=50, help="Dimension of word embeddings")
        cmd.add_argument("--lstm-dim-char-based", "-C", dest="C", metavar="dim", type=int, default=100, help="Dimension of char-based embeddings")
        cmd.add_argument("--lstm-dim-char", "-c", dest="c", metavar="dim", type=int, default=100, help="Dimension of char embeddings")
        cmd.add_argument("--lstm-no-char", dest="no_char", action="store_true", help="don't use character-based embeddings")
        cmd.add_argument("--lstm-no-word", dest="no_word", action="store_true", help="don't use word embeddings")
        cmd.add_argument("--lstm-init", "-I", dest="I", type=float, default=0.01, help="Embedding initialization")
        cmd.add_argument("--lstm-drop-char", type=float, default=0.2, help="Char dropout")
        cmd.add_argument("--lstm-drop-word", type=float, default=0.2, help="Word dropout")
        # cmd.add_argument("--lstm-layers", "-P", metavar="l", type=int, default=2, help="Depth of word transducer, min=2")
        # cmd.add_argument("--lstm-residual-connection", metavar=" ", type=bool, default=True, help="Add residual connections between LSTM layers")
        # cmd.add_argument("--lstm-layer-norm", type=str, choices=["000", "001", "010", "100", "110", "101", "011", "111"], default="001",
        #                  help="Add layer normalizations at <input> <output l1> <output l2>")
if __name__ == "__main__":
from collections import defaultdict
import argparse
sentences = ["Influential members of the House Ways and Means Committee introduced legislation that would restrict how the new savings-and-loan bailout agency can raise capital , creating another potential obstacle to the government 's sale of sick thrifts .", "The bill , whose backers include Chairman Dan Rostenkowski -LRB- D. , Ill. -RRB- , would prevent the Resolution Trust Corp. from raising temporary working capital by having an RTC-owned bank or thrift issue debt that would n't be counted on the federal budget ."]
sentences = [sent.split() for sent in sentences]
voc = []
chars = []
for sentence in sentences:
for token in sentence:
voc.append(token)
for char in token:
chars.append(char)
i2word = ["<PAD>", "<UNK>"] + sorted(set(voc))
i2char = ["<PAD>", "<UNK>", "<START>", "<STOP>", "-LRB-", "-RRB-"] + sorted(set(chars))
char2i = {c:i for i,c in enumerate(i2char)}
word2i = {w:i for i,w in enumerate(i2word)}
words2tensors = Words2Tensors(char2i, word2i, 0.2, 0.2)
# print(words2tensors.w2tensor["members"])
# char_lstm = CharacterLstmLayer(30, len(i2char), 20, words2tensors=words2tensors)
# embed_sentence = char_lstm(sentences[0])
# print(embed_sentence.shape)
args = argparse.Namespace()
args.c = 10
args.C = 12
args.I = 0.01
args.w = 15
args.lstm_dim = 30
args.no_char = False
args.no_word = False
args.lstm_drop_char = 0.2
args.lstm_drop_word = 0.2
pchar = 0.2
pword = 0.2
embedder = HierarchicalLSTM(args, char2i, word2i)
output = embedder(sentences)
print([i.shape for i in output])
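    # Expected result (a sketch, assuming WordEmbedder returns one vector per token):
    # one tensor per input sentence, each of shape (sentence_length, args.lstm_dim),
    # i.e. (number_of_tokens, 30) with the toy settings above.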