<class 'RuntimeError'> at /
cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

Python   /usr/local/lib/python3.6/dist-packages/transformers/modeling_gpt2.py in forward, line 372
Web      POST http://192.168.1.100:8080/

Traceback (innermost first)
/usr/local/lib/python3.6/dist-packages/transformers/modeling_gpt2.py in forward

            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])   ...
        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])
        if past is None:
Variable         Value
attention_mask   None
head_mask        None
input_ids        tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64)
input_shape      torch.Size([1, 0])
past             None
position_ids     None
self             GPT2Model(...)  (24 Blocks of Attention + MLP, hidden size 768, vocab size 13317; full module repr omitted)
token_type_ids   None
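The local variables above already reveal the root cause: input_ids reached GPT2Model.forward as an empty tensor of shape (1, 0), so input_shape[-1] is 0 and input_ids.view(-1, input_shape[-1]) becomes view(-1, 0). With zero elements, PyTorch cannot infer the -1 dimension, hence the RuntimeError. A minimal standalone sketch (not taken from the service code) that reproduces the same message:

    import torch

    # Empty batch with shape (1, 0), matching input_ids in the traceback
    input_ids = torch.empty(1, 0, dtype=torch.int64)
    input_shape = input_ids.size()          # torch.Size([1, 0])

    # view(-1, 0) on a 0-element tensor: the -1 dimension is ambiguous,
    # so PyTorch raises the exact RuntimeError shown at the top of the page
    input_ids.view(-1, input_shape[-1])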
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__

            if result is not None:
                if not isinstance(result, tuple):
                    result = (result,)
                input = result
        if torch._C._get_tracing_state():
            result = self._slow_forward(*input, **kwargs)
        else:
            result = self.forward(*input, **kwargs)   ...
        for hook in self._forward_hooks.values():
            hook_result = hook(self, input, result)
            if hook_result is not None:
                result = hook_result
        if len(self._backward_hooks) > 0:
            var = result
Variable   Value
input      (tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64),)
kwargs     {'attention_mask': None, 'head_mask': None, 'past': None, 'position_ids': None, 'token_type_ids': None}
self       GPT2Model(...)  (same module as above; full repr omitted)
/usr/local/lib/python3.6/dist-packages/transformers/modeling_gpt2.py in forward

    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                labels=None):
        transformer_outputs = self.transformer(input_ids,
                                               past=past,
                                               attention_mask=attention_mask,
                                               token_type_ids=token_type_ids,
                                               position_ids=position_ids,
                                               head_mask=head_mask)   ...
        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)
        outputs = (lm_logits,) + transformer_outputs[1:]
        if labels is not None:
Variable         Value
attention_mask   None
head_mask        None
input_ids        tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64)
labels           None
past             None
position_ids     None
self             GPT2LMHeadModel(...)  (the GPT2Model above plus lm_head: Linear(in_features=768, out_features=13317, bias=False); full repr omitted)
token_type_ids   None
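Nothing in GPT2LMHeadModel.forward changes input_ids; it hands the tensor straight to self.transformer, so the empty input must come from the caller further out. For contrast, a well-formed call passes a non-empty (1, n) tensor. A hedged sketch against the 2.x-era transformers API seen in this traceback (the 'gpt2' checkpoint and the prompt are placeholders; the deployed model uses its own 13317-token vocabulary):

    import torch
    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    # encode() must yield at least one token id, so input_ids has shape (1, n) with n >= 1
    input_ids = torch.tensor(tokenizer.encode('Hello, world'), dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        outputs = model(input_ids)     # this transformers version returns a plain tuple
    lm_logits = outputs[0]             # shape (1, n, vocab_size)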
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__

            if result is not None:
                if not isinstance(result, tuple):
                    result = (result,)
                input = result
        if torch._C._get_tracing_state():
            result = self._slow_forward(*input, **kwargs)
        else:
            result = self.forward(*input, **kwargs)   ...
        for hook in self._forward_hooks.values():
            hook_result = hook(self, input, result)
            if hook_result is not None:
                result = hook_result
        if len(self._backward_hooks) > 0:
            var = result
Variable   Value
input      ()
kwargs     {'input_ids': tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64)}
self       GPT2LMHeadModel(...)  (same module as above; full repr omitted)
/root/gptc/generate_api_gpu_one_sen.py in sample_sequence

    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0)
    generated = context
    with torch.no_grad():
        for _ in trange(length):
            inputs = {'input_ids': generated[0][-(n_ctx - 1):].unsqueeze(0)}
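This outermost frame is where the empty tensor is built: context is the tokenized prompt from the POST request, and if it arrives empty, generated has shape (1, 0) and the slice fed to the model stays empty. The straightforward fix is to reject an empty context before entering the generation loop. A sketch of that guard follows; the signature is abbreviated to the names visible in the traceback, and the check itself is an assumed addition, not code from generate_api_gpu_one_sen.py:

    import torch
    from tqdm import trange

    def sample_sequence(model, context, length, n_ctx, device='cuda'):
        # Assumed guard: an empty token list is exactly what produced the
        # (1, 0) input_ids that crashed GPT2Model.forward above.
        if context is None or len(context) == 0:
            raise ValueError('context must contain at least one token id')
        context = torch.tensor(context, dtype=torch.long, device=device)
        context = context.unsqueeze(0)
        generated = context
        with torch.no_grad():
            for _ in trange(length):
                inputs = {'input_ids': generated[0][-(n_ctx - 1):].unsqueeze(0)}
                # ... rest of the original generation loop unchanged

Equally valid is to validate the prompt at the web layer and return an HTTP 400 for an empty request, instead of letting the RuntimeError surface as a 500 from deep inside the model.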