import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model


class TransformerBlock(layers.Layer):
    """One decoder block: multi-head self-attention and a position-wise feed-forward
    network, each with a residual connection and layer normalization."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        # key_dim is the per-head size; embed_dim // num_heads follows the GPT-2 convention
        # (key_dim=embed_dim would make every head as wide as the whole model)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation='gelu'),
            layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False, mask=None):
        # Self-attention sub-layer with residual connection and layer norm
        attn_output = self.att(inputs, inputs, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Feed-forward sub-layer with residual connection and layer norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


class TokenAndPositionEmbedding(layers.Layer):
    """Sum of learned token embeddings and learned position embeddings."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)  # embed the token ids before adding the position embeddings
        return x + positions


class TransformerModel(Model):
    def __init__(self, vocab_size, maxlen, embed_dim=768, num_heads=12, ff_dim=3072, num_layers=12):
        super(TransformerModel, self).__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_blocks = [TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(num_layers)]
        self.layernorm = layers.LayerNormalization(epsilon=1e-6)
        self.dropout = layers.Dropout(0.1)
        self.final_dense = layers.Dense(vocab_size)

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, training=training)
        x = self.layernorm(x)
        x = self.dropout(x, training=training)
        return self.final_dense(x)  # per-position logits over the vocabulary
Based on the OpenAI team's TensorFlow 1.x implementation of GPT-2, I rewrote the code in the latest version of Keras yesterday. I plan to train GPT-2 myself with Keras. Since the models I have in mind are small in parameter count, I will release three models aimed at different domains. For now the model is tentatively named Gallus.
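A minimal usage sketch of the model above; the vocab_size of 50257 and maxlen of 1024 are the values from the GPT-2 release and are only assumptions here, not final choices for Gallus:

# Minimal usage sketch; 50257 and 1024 are GPT-2's values and only assumptions here
vocab_size, maxlen = 50257, 1024
model = TransformerModel(vocab_size=vocab_size, maxlen=maxlen)
dummy_ids = tf.random.uniform((2, maxlen), maxval=vocab_size, dtype=tf.int32)
logits = model(dummy_ids, training=False)   # shape (2, maxlen, vocab_size)
# Note: the count comes out above the 124M target because final_dense is a separate
# matrix here; GPT-2 reaches 124M by tying the output projection to the token embedding
print(model.count_params())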
How can you help Gallus?
1. Help me find some data, everyone. The reason I'm posting on Luogu is the hope that experienced users can contribute data, including but not limited to code data, literary text data, and text from well-known forum threads.
2. Help me improve the code each time I publish it (see the sketch below for one example). Gallus is an open-source project, and everyone is free to improve and use it at no cost.
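As one example for point 2: the code above never applies a causal mask, but a GPT-2-style decoder has to stop each position from attending to later positions. A minimal sketch of how such a mask could be built and threaded through; the helper name causal_mask is made up here, not part of the current code:

# Hypothetical helper: a lower-triangular mask so position i only attends to positions <= i
def causal_mask(seq_len):
    return tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)

# In TransformerModel.call, the mask could then be passed to every block:
#     mask = causal_mask(tf.shape(inputs)[1])
#     x = transformer_block(x, training=training, mask=mask)
# (Recent Keras versions also expose use_causal_mask=True directly on MultiHeadAttention.)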
When will the complete version of Gallus be released?
All of the Gallus-1 code will be open-sourced before this year's summer vacation, and the dataset will be open-sourced during the summer, but I cannot guarantee that the model weights will be released, because I really cannot get hold of high-performance GPUs. Even the smallest version of Gallus has 124M parameters, and the largest I plan to build should be around 7B. Training the 124M model takes roughly one V100 GPU, and by industry experience training a 7B model should take about eight V100s. I cannot obtain GPUs like these; could some kind expert train it with my code and open-source the result?
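For anyone who does have the hardware, here is roughly how training could be wired up with the code above; train_ids, the tokenizer, and all hyperparameters are placeholders, not decisions I have finalized:

# Rough training sketch; `train_ids` is a hypothetical int32 array of token ids with
# shape (num_sequences, maxlen + 1). Inputs and labels are shifted by one token so the
# model is trained on next-token prediction.
inputs = train_ids[:, :-1]
labels = train_ids[:, 1:]

model = TransformerModel(vocab_size=vocab_size, maxlen=maxlen)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),   # learning rate is a guess
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
model.fit(inputs, labels, batch_size=8, epochs=1)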