I want to use Keras for authorship attribution. I have a list of (text, label) pairs. I am trying to use the Keras built-in vectorizer, but I get the following error:
Vectorizing sequence data...
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/angelo/org/courses/corpusling/finalproject/src/neuralnet.py", line 46, in <module>
    X_train = tokenizer.texts_to_matrix(X_train, mode='binary')
  File "/home/angelo/org/courses/corpusling/finalproject/venv0/lib/python3.5/site-packages/keras/preprocessing/text.py", line 166, in texts_to_matrix
    sequences = self.texts_to_sequences(texts)
  File "/home/angelo/org/courses/corpusling/finalproject/venv0/lib/python3.5/site-packages/keras/preprocessing/text.py", line 131, in texts_to_sequences
    for vect in self.texts_to_sequences_generator(texts):
  File "/home/angelo/org/courses/corpusling/finalproject/venv0/lib/python3.5/site-packages/keras/preprocessing/text.py", line 150, in texts_to_sequences_generator
    i = self.word_index.get(w)
AttributeError: 'Tokenizer' object has no attribute 'word_index'
The following is my code so far:
import glob
import os
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
def get_label(filename):
    tmp = os.path.split(filename)[0]
    label = os.path.basename(tmp)
    return label

def read_file(filename):
    with open(filename) as f:
        text = f.read()
    return text
traindocs = "../data/C50/C50train/*/*.txt"
testdocs = "../data/C50/C50test/*/*.txt"
documents_train = (read_file(f) for f in glob.iglob(traindocs))
labels_train = (get_label(f) for f in glob.iglob(traindocs))
documents_test = (read_file(f) for f in glob.iglob(testdocs))
labels_test = (get_label(f) for f in glob.iglob(testdocs))
df_train = pd.DataFrame([documents_train, labels_train])
df_train = df_train.transpose()
df_train.rename(columns={0: 'text', 1: 'author'}, inplace=True)
df_test = pd.DataFrame([documents_test, labels_test])
df_test = df_test.transpose()
df_test.rename(columns={0: 'text', 1: 'author'}, inplace=True)
max_words = 1000
print('Vectorizing sequence data...')
tokenizer = Tokenizer(nb_words=max_words)
X_train, Y_train = df_train.text, df_train.author
X_test, Y_test = df_test.text, df_test.author
X_train = tokenizer.texts_to_matrix(X_train, mode='binary')
X_test = tokenizer.texts_to_matrix(X_test, mode='binary')
nb_classes = np.max(Y_train) + 1
print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(Y_train, nb_classes)
Y_test = np_utils.to_categorical(Y_test, nb_classes)
model = Sequential()
model.add(Dense(output_dim=512, input_dim=max_words))
model.add(Activation("relu"))
model.add(Dense(output_dim=(np.max(Y_train)+1)))
model.add(Activation("softmax"))
model.compile(loss='categorical_crossentropy',
              optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, Y_train, nb_epoch=5, batch_size=32)
loss_and_metrics = model.evaluate(X_test, Y_test, batch_size=32)
You need to call tokenizer.fit_on_texts(texts) before using tokenizer.texts_to_matrix(). Here texts is the list of text data (both train and test). fit_on_texts() uses it to build word_index, which is simply a mapping from each unique word to a number; this mapping is later used to generate the matrix.
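A minimal sketch of the fix in terms of the variables from your script (using pd.concat to combine the train and test texts is just one way to cover the whole corpus; any list of strings works):

texts = pd.concat([df_train.text, df_test.text])

tokenizer = Tokenizer(nb_words=max_words)
tokenizer.fit_on_texts(texts)  # builds tokenizer.word_index

# word_index maps every word seen during fitting to an integer,
# e.g. {'the': 1, 'to': 2, ...}, with the most frequent words first
print(len(tokenizer.word_index))

# texts_to_matrix can now look each word up in word_index
X_train = tokenizer.texts_to_matrix(df_train.text, mode='binary')
X_test = tokenizer.texts_to_matrix(df_test.text, mode='binary')

Note that word_index keeps every word it saw, while nb_words=1000 means only the 1000 most frequent words are actually used when the matrix is generated.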