Commit a427e2ec authored by Rohit Prasad

Initial commit

import torch
MAX_LENGTH = 200
hidden_size = 256
# select the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SOS_token = 0
EOS_token = 1
UNK_token = 2
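# Note: these indices must stay in sync with Lang.index2word in the helpers module
# ({0: "SOS", 1: "EOS", 2: "UNK"}); both training and decoding rely on that mapping.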
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle
import sys
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
from modules import EncoderRNN, AttnDecoderRNN
from helpers import prepareData, tensorFromSentence
from config import *
random.seed(42)
testing_file_name = sys.argv[1]
input_lang = None
output_lang = None
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
with torch.no_grad():
input_tensor = tensorFromSentence(input_lang, sentence)
input_length = input_tensor.size()[0]
encoder_hidden = encoder.initHidden()
encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
for ei in range(input_length):
encoder_output, encoder_hidden = encoder(input_tensor[ei],
encoder_hidden)
encoder_outputs[ei] += encoder_output[0, 0]
decoder_input = torch.tensor([[SOS_token]], device=device) # SOS
decoder_hidden = encoder_hidden
decoded_words = []
decoder_attentions = torch.zeros(max_length, max_length)
for di in range(max_length):
decoder_output, decoder_hidden, decoder_attention = decoder(
decoder_input, decoder_hidden, encoder_outputs)
decoder_attentions[di] = decoder_attention.data
topv, topi = decoder_output.data.topk(1)
if topi.item() == EOS_token:
decoded_words.append('<EOS>')
break
else:
decoded_words.append(output_lang.index2word[topi.item()])
decoder_input = topi.squeeze().detach()
return decoded_words, decoder_attentions[:di + 1]
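# Illustrative usage (assumes the pickled vocabularies have been restored and the trained
# networks loaded, and that the sentence is space-tokenised like the training data):
#   words, attn = evaluate(encoder1, attn_decoder1, "man stay together")
#   print(' '.join(words))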
def restoreInputOutputLang():
global input_lang, output_lang
input_lang = pickle.load(open('input_lang.pickle', 'rb'))
output_lang = pickle.load(open('output_lang.pickle', 'rb'))
def evaluateRandomly(encoder, decoder, pairs, n=10):
restoreInputOutputLang()
for i in range(n):
pair = random.choice(pairs)
print('>', pair[0])
print('=', pair[1])
output_words, attentions = evaluate(encoder, decoder, pair[0])
output_sentence = ' '.join(output_words)
print('<', output_sentence)
print('')
def showAttention(input_sentence, output_words, attentions, id):
# Set up figure with colorbar
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(attentions.numpy(), cmap='bone')
fig.colorbar(cax)
# Set up axes
ax.set_xticklabels([''] + input_sentence.split(' ') +
['<EOS>'], rotation=90)
ax.set_yticklabels([''] + output_words)
# Show label at every tick
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
plt.savefig('attention' + str(id) + '.png')
def evaluateAndShowAttention(input_sentence, id):
output_words, attentions = evaluate(
encoder1, attn_decoder1, input_sentence)
print('input =', input_sentence)
print('output =', ' '.join(output_words))
showAttention(input_sentence, output_words, attentions, id)
def evaluateTestSet(encoder, decoder):
predictionsFile = open('predictions.txt', 'w')
for i in range(len(pairs)):
pair = pairs[i]
try:
output_words, attentions = evaluate(encoder, decoder, pair[0])
        except Exception:
            # skip pairs that fail to evaluate (e.g. malformed input)
            continue
        output_sentence = ' '.join(output_words[:-1])
        if i % 1000 == 0:
            print(i, pair[1] + '\t' + output_sentence + '\n')
        predictionsFile.write(pair[1] + '\t' + output_sentence + '\n')
    predictionsFile.close()
if __name__ == '__main__':
print("HERE")
encoder1 = torch.load('encoder.network')
attn_decoder1 = torch.load('attn_decoder.network')
_, _, pairs = prepareData('unl', 'eng', testing_file_name)
restoreInputOutputLang()
print(pairs)
evaluateRandomly(encoder1, attn_decoder1, pairs)
# evaluateAndShowAttention("man stay together", 1)
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import sys
from config import *
class Lang:
def __init__(self, name):
self.name = name
self.word2index = {}
self.word2count = {}
self.index2word = {0: "SOS", 1: "EOS", 2: "UNK"}
        self.n_words = 3  # Count SOS, EOS and UNK
def addSentence(self, sentence):
for word in sentence.split(' '):
self.addWord(word)
def addWord(self, word):
if word not in self.word2index:
self.word2index[word] = self.n_words
self.word2count[word] = 1
self.index2word[self.n_words] = word
self.n_words += 1
else:
self.word2count[word] += 1
def unicodeToAscii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
)
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
s = unicodeToAscii(s.lower().strip())
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
return s
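# For example (illustrative): normalizeString("Let's try something.") -> "let s try something ."
# (the apostrophe and surrounding whitespace collapse to single spaces, punctuation keeps a leading space).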
def readLangs(lang1, lang2, training_file_name):
print("Reading lines...")
# Read the file and split into lines
    lines = open(training_file_name, encoding='utf-8').read().strip().split('\n')
# Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
input_lang = Lang(lang1)
output_lang = Lang(lang2)
return input_lang, output_lang, pairs
def filterPair(p):
return len(p[0].split(' ')) < MAX_LENGTH and \
len(p[1].split(' ')) < MAX_LENGTH
def filterPairs(pairs):
return [pair for pair in pairs if filterPair(pair)]
def prepareData(lang1, lang2, training_file_name):
input_lang, output_lang, pairs = readLangs(lang1, lang2, training_file_name)
print("Read %s sentence pairs" % len(pairs))
pairs = filterPairs(pairs)
print("Trimmed to %s sentence pairs" % len(pairs))
print("Counting words...")
for pair in pairs:
input_lang.addSentence(pair[0])
output_lang.addSentence(pair[1])
print("Counted words:")
print(input_lang.name, input_lang.n_words)
print(output_lang.name, output_lang.n_words)
return input_lang, output_lang, pairs
def indexesFromSentence(lang, sentence):
return [lang.word2index[word] if word in lang.word2index else UNK_token for word in sentence.split(' ')]
def tensorFromSentence(lang, sentence):
indexes = indexesFromSentence(lang, sentence)
indexes.append(EOS_token)
return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)
def tensorsFromPair(pair, input_lang, output_lang):
input_tensor = tensorFromSentence(input_lang, pair[0])
target_tensor = tensorFromSentence(output_lang, pair[1])
return (input_tensor, target_tensor)
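# Example (illustrative): tokens unseen during training fall back to UNK_token, so
#   indexesFromSentence(output_lang, "man stay together")
# yields vocabulary indices (2 for any out-of-vocabulary word), while
# tensorFromSentence appends EOS_token and returns a (seq_len, 1) LongTensor on `device`.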
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle
import sys
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from config import *
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size):
super(EncoderRNN, self).__init__()
self.hidden_size = hidden_size
self.embedding = nn.Embedding(input_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size)
def forward(self, input, hidden):
embedded = self.embedding(input).view(1, 1, -1)
output = embedded
output, hidden = self.gru(output, hidden)
return output, hidden
def initHidden(self):
return torch.zeros(1, 1, self.hidden_size, device=device)
class AttnDecoderRNN(nn.Module):
def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
super(AttnDecoderRNN, self).__init__()
self.hidden_size = hidden_size
self.output_size = output_size
self.dropout_p = dropout_p
self.max_length = max_length
self.embedding = nn.Embedding(self.output_size, self.hidden_size)
self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
self.dropout = nn.Dropout(self.dropout_p)
self.gru = nn.GRU(self.hidden_size, self.hidden_size)
self.out = nn.Linear(self.hidden_size, self.output_size)
def forward(self, input, hidden, encoder_outputs):
embedded = self.embedding(input).view(1, 1, -1)
embedded = self.dropout(embedded)
attn_weights = F.softmax(
self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
attn_applied = torch.bmm(attn_weights.unsqueeze(0),
encoder_outputs.unsqueeze(0))
output = torch.cat((embedded[0], attn_applied[0]), 1)
output = self.attn_combine(output).unsqueeze(0)
output = F.relu(output)
output, hidden = self.gru(output, hidden)
output = F.log_softmax(self.out(output[0]), dim=1)
return output, hidden, attn_weights
def initHidden(self):
return torch.zeros(1, 1, self.hidden_size, device=device)
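# Quick shape check (illustrative; the vocabulary sizes here are assumptions, not training values):
#   enc = EncoderRNN(input_size=10, hidden_size=hidden_size).to(device)
#   dec = AttnDecoderRNN(hidden_size, output_size=10).to(device)
#   out, h = enc(torch.tensor([1], device=device), enc.initHidden())   # out: (1, 1, hidden_size)
#   logp, h, attn = dec(torch.tensor([[SOS_token]], device=device), h,
#                       torch.zeros(MAX_LENGTH, hidden_size, device=device))   # logp: (1, 10)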
import re
import random
import pickle
import sys
from collections import OrderedDict
random.seed(42)
filename = sys.argv[1]
output_filename = sys.argv[2]
input_type = sys.argv[3]
file = open("data/" + filename, 'r')
content = file.read()
file.close()
###########
# Example UNL input
# [S:1]
# {org:en}
# Let's try something.
# {/org}
# {unl}
# cag(let(icl>offer>do,com>modality,obj>uw,cag>person).@entry.@imperative,we(icl>group).@pl)
# obj:01(try(icl>attempt>do,cob>thing,agt>volitional_thing,obj>uw).@entry,something(icl>thing>thing))
# obj(let(icl>offer>do,com>modality,obj>uw,cag>person).@entry.@imperative,:01)
# {/unl}
# [/S]
###########
# matches [S]..[/S]
regexUnlDocumentSentenceRegex = re.compile(r"\[S.*?\].*?\[/s\]", re.M|re.S|re.I)
regexUnl = re.compile("{unl}.*?{/unl}", re.M|re.S|re.I)
regexSentence = re.compile("{org:en}.*?{/org}", re.M|re.S|re.I)
documentSentences = re.findall(regexUnlDocumentSentenceRegex, content)
unlExp = re.compile(r"(.*?)\((.+?)(\(.*\))?([:.].*?)?,(.+?)(\(.*\))?([:.].*?)?\)", re.M|re.S|re.I)
unlWords = re.compile(r"\w+")
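# unlExp is intended to pull apart a single UNL relation line: group(1) is the relation
# name, group(2) the first universal word and group(5) the second, with any restriction
# lists and attribute suffixes caught by the optional groups (see the sample sentence above).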
unlInput = []
sentenceOutput = []
for documentSentence in documentSentences:
unlMatch = re.findall(regexUnl, documentSentence)
sentenceMatch = re.findall(regexSentence, documentSentence)
if (len(unlMatch) != 0):
unlMatch = unlMatch[0]
sentenceMatch = ' '.join(sentenceMatch[0].split('\n')[1:-1])
if (len(sentenceMatch.split()) > 10):
continue
try:
unlMatch.encode('utf-8')
        except UnicodeError:
            # skip sentences that cannot be cleanly encoded
            continue
lines = unlMatch.split('\n')[1:-1]
_unlMatches = []
for line in lines:
relationMatches = re.search(unlExp, line)
relationWords = re.findall(unlWords, line)
if input_type == 'all':
_unlMatches.append(' '.join(relationWords))
else:
if not relationMatches:
continue
relation = relationMatches.group(1)
uw1 = relationMatches.group(2).replace('"', '')
uw2 = relationMatches.group(5).replace('"', '')
_unlMatches += [relation, uw1, uw2]
unlInput.append(' '.join(_unlMatches))
sentenceOutput.append(sentenceMatch)
seq2seqInputs = []
unlInputLength = {}
for unl,sentence in zip(unlInput, sentenceOutput):
length = len(unl.split())
if length not in unlInputLength:
unlInputLength[length] = 0
unlInputLength[length] += 1
if (length > 400):
print(length, sentence)
seq2seqInputs.append(unl + '\t' + sentence)
unlInputLength = OrderedDict(sorted(unlInputLength.items()))
print(unlInputLength)
seq2seqInputFile = open("data/" + 'seq2seq-input', 'w')
seq2seqInputFile.write('\n'.join(seq2seqInputs))
seq2seqInputFile.close()
########
# Random split of seq2seq input
########
trainFile = open("data/" + '{}-training.txt'.format(output_filename), 'w')
testFile = open("data/" + '{}-testing.txt'.format(output_filename), 'w')
splitNum = int(len(seq2seqInputs) * 0.8)
random.shuffle(seq2seqInputs)
trainFile.write('\n'.join(seq2seqInputs[:splitNum]))
testFile.write('\n'.join(seq2seqInputs[splitNum:]))
trainFile.close()
testFile.close()
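# The script writes one tab-separated pair per line ("<unl tokens>\t<english sentence>")
# to data/seq2seq-input, then shuffles and splits it 80/20 into
# data/<output_filename>-training.txt and data/<output_filename>-testing.txt.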
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle
import sys
import itertools
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
from config import *
from modules import EncoderRNN, AttnDecoderRNN
from helpers import prepareData, tensorsFromPair
from evaluate import evaluateRandomly
random.seed(42)
training_file_name = sys.argv[1]
input_lang, output_lang, pairs = prepareData('unl', 'eng', training_file_name)
pickle.dump(input_lang, open('input_lang.pickle', 'wb'))
pickle.dump(output_lang, open('output_lang.pickle', 'wb'))
print(random.choice(pairs))
teacher_forcing_ratio = 0.5
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
encoder_hidden = encoder.initHidden()
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
input_length = input_tensor.size(0)
target_length = target_tensor.size(0)
encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
loss = 0
for ei in range(input_length):
encoder_output, encoder_hidden = encoder(
input_tensor[ei], encoder_hidden)
encoder_outputs[ei] = encoder_output[0, 0]
decoder_input = torch.tensor([[SOS_token]], device=device)
decoder_hidden = encoder_hidden
use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
if use_teacher_forcing:
# Teacher forcing: Feed the target as the next input
for di in range(target_length):
decoder_output, decoder_hidden, decoder_attention = decoder(
decoder_input, decoder_hidden, encoder_outputs)
loss += criterion(decoder_output, target_tensor[di])
decoder_input = target_tensor[di] # Teacher forcing
else:
# Without teacher forcing: use its own predictions as the next input
for di in range(target_length):
decoder_output, decoder_hidden, decoder_attention = decoder(
decoder_input, decoder_hidden, encoder_outputs)
topv, topi = decoder_output.topk(1)
decoder_input = topi.squeeze().detach() # detach from history as input
loss += criterion(decoder_output, target_tensor[di])
if decoder_input.item() == EOS_token:
break
loss.backward()
encoder_optimizer.step()
decoder_optimizer.step()
return loss.item() / target_length
import time
import math
def asMinutes(s):
m = math.floor(s / 60)
s -= m * 60
return '%dm %ds' % (m, s)
def timeSince(since):
now = time.time()
s = now - since
return 'Elapsed time: %s' % (asMinutes(s))
def showPlot(points):
plt.figure()
fig, ax = plt.subplots()
# this locator puts ticks at regular intervals
loc = ticker.MultipleLocator(base=0.2)
ax.yaxis.set_major_locator(loc)
plt.plot(points)
plt.savefig('losses.png')
def trainIters(encoder, decoder, epoch, print_every=1000, plot_every=100, learning_rate=0.01):
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
training_pairs = [tensorsFromPair(pairs[i], input_lang, output_lang)
for i in range(len(pairs))]
criterion = nn.NLLLoss()
    if epoch is None:
        _range = itertools.count(1)
    else:
        _range = range(1, epoch + 1)
try:
for _epoch in _range:
for _i in range(1, len(pairs)+1):
training_pair = training_pairs[_i-1]
input_tensor = training_pair[0]
target_tensor = training_pair[1]
loss = train(input_tensor, target_tensor, encoder,
decoder, encoder_optimizer, decoder_optimizer, criterion)
print_loss_total += loss
plot_loss_total += loss
                if ((_epoch - 1) * len(pairs) + _i) % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    print('Epoch: %d\tIter: %d\t%s\tLoss: %.4f' % (_epoch, _i, timeSince(start), print_loss_avg))
                if ((_epoch - 1) * len(pairs) + _i) % 5000 == 0:
                    torch.save(encoder, 'encoder.network')
                    torch.save(decoder, 'attn_decoder.network')
                    evaluateRandomly(encoder, decoder, pairs)
                if ((_epoch - 1) * len(pairs) + _i) % plot_every == 0:
                    plot_loss_avg = plot_loss_total / plot_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0
    except KeyboardInterrupt:
        # allow Ctrl-C to stop training early; still plot the losses collected so far
        pass
showPlot(plot_losses)
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
trainIters(encoder1, attn_decoder1, None, print_every=1000)
torch.save(encoder1, 'encoder.network')
torch.save(attn_decoder1, 'attn_decoder.network')
# output_words, attentions = evaluate(
# encoder1, attn_decoder1, "je suis trop froid .")
# plt.matshow(attentions.numpy())
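# Typical invocation (illustrative; the script file names are assumptions):
#   python preprocess.py <unl-corpus-file> <output-prefix> relations
#   python train.py data/<output-prefix>-training.txt
#   python evaluate.py data/<output-prefix>-testing.txt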