# pretrained.py
#
# Train a text classifier on the 20 Newsgroups corpus using pre-trained
# GloVe word embeddings frozen inside a Keras Embedding layer, followed by
# a 1D convnet with global max pooling.
import os
import sys

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import (Conv1D, Dense, Embedding,
                                     GlobalMaxPooling1D, Input, MaxPooling1D)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
  9. BASE_DIR = ''
  10. GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
  11. TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
  12. MAX_SEQUENCE_LENGTH = 1000
  13. MAX_NUM_WORDS = 20000
  14. EMBEDDING_DIM = 100
  15. VALIDATION_SPLIT = 0.2
  16. # first, build index mapping words in the embeddings set
  17. # to their embedding vector
  18. print('Indexing word vectors.')
  19. embeddings_index = {}
  20. with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
  21. for line in f:
  22. values = line.split()
  23. word = values[0]
  24. coefs = np.asarray(values[1:], dtype='float32')
  25. embeddings_index[word] = coefs
  26. print('Found %s word vectors.' % len(embeddings_index))
  27. # second, prepare text samples and their labels
  28. print('Processing text dataset')
  29. texts = [] # list of text samples
  30. labels_index = {} # dictionary mapping label name to numeric id
  31. labels = [] # list of label ids
  32. for name in sorted(os.listdir(TEXT_DATA_DIR)):
  33. path = os.path.join(TEXT_DATA_DIR, name)
  34. if os.path.isdir(path):
  35. label_id = len(labels_index)
  36. labels_index[name] = label_id
  37. for fname in sorted(os.listdir(path)):
  38. if fname.isdigit():
  39. fpath = os.path.join(path, fname)
  40. args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
  41. with open(fpath, **args) as f:
  42. t = f.read()
  43. i = t.find('\n\n') # skip header
  44. if 0 < i:
  45. t = t[i:]
  46. texts.append(t)
  47. labels.append(label_id)
  48. print('Found %s texts.' % len(texts))
  49. # finally, vectorize the text samples into a 2D integer tensor
  50. tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
  51. tokenizer.fit_on_texts(texts)
  52. sequences = tokenizer.texts_to_sequences(texts)
  53. word_index = tokenizer.word_index
  54. print('Found %s unique tokens.' % len(word_index))
  55. data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
  56. labels = to_categorical(np.asarray(labels))
  57. print('Shape of data tensor:', data.shape)
  58. print('Shape of label tensor:', labels.shape)
  59. # split the data into a training set and a validation set
  60. indices = np.arange(data.shape[0])
  61. np.random.shuffle(indices)
  62. data = data[indices]
  63. labels = labels[indices]
  64. num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
  65. x_train = data[:-num_validation_samples]
  66. y_train = labels[:-num_validation_samples]
  67. x_val = data[-num_validation_samples:]
  68. y_val = labels[-num_validation_samples:]
  69. print('Preparing embedding matrix.')
  70. # prepare embedding matrix
  71. num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
  72. embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
  73. for word, i in word_index.items():
  74. if i > MAX_NUM_WORDS:
  75. continue
  76. embedding_vector = embeddings_index.get(word)
  77. if embedding_vector is not None:
  78. # words not found in embedding index will be all-zeros.
  79. embedding_matrix[i] = embedding_vector
  80. # load pre-trained word embeddings into an Embedding layer
  81. # note that we set trainable = False so as to keep the embeddings fixed
  82. embedding_layer = Embedding(num_words,
  83. EMBEDDING_DIM,
  84. embeddings_initializer=Constant(embedding_matrix),
  85. input_length=MAX_SEQUENCE_LENGTH,
  86. trainable=False)
  87. print('Training model.')
  88. # train a 1D convnet with global maxpooling
  89. sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
  90. embedded_sequences = embedding_layer(sequence_input)
  91. x = Conv1D(128, 5, activation='relu')(embedded_sequences)
  92. x = MaxPooling1D(5)(x)
  93. x = Conv1D(128, 5, activation='relu')(x)
  94. x = MaxPooling1D(5)(x)
  95. x = Conv1D(128, 5, activation='relu')(x)
  96. x = GlobalMaxPooling1D()(x)
  97. x = Dense(128, activation='relu')(x)
  98. preds = Dense(len(labels_index), activation='softmax')(x)
  99. model = Model(sequence_input, preds)
  100. model.compile(loss='categorical_crossentropy',
  101. optimizer='rmsprop',
  102. metrics=['acc'])
  103. model.fit(x_train, y_train,
  104. batch_size=128,
  105. epochs=10,
  106. validation_data=(x_val, y_val))