HTMLcontentnonenoneTherethere

降重资讯 admin 浏览

小编:



HTMLcontentnonenoneTherethere

nonenoneThere is no one <class 'RuntimeError'> at /

<class 'RuntimeError'> at /

cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

Python /usr/local/lib/python3.6/dist-packages/transformers/modeling_gpt2.py in forward, line 372
Web POST http://192.168.1.100:8080/

Traceback (innermost first)

  • /usr/local/lib/python3.6/dist-packages/transformers/modeling_gpt2.py in forward
    1. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
    2. """
    3. for layer, heads in heads_to_prune.items():
    4. self.h[layer].attn.prune_heads(heads)
    5. def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
    6. input_shape = input_ids.size()
    1. input_ids = input_ids.view(-1, input_shape[-1]) ...
    1. if token_type_ids is not None:
    2. token_type_ids = token_type_ids.view(-1, input_shape[-1])
    3. if position_ids is not None:
    4. position_ids = position_ids.view(-1, input_shape[-1])
    5. if past is None:
    VariableValue
    attention_mask
    None
    head_mask
    None
    input_ids
    tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64)
    input_shape
    torch.Size([1, 0])
    past
    None
    position_ids
    None
    self
    GPT2Model( (wte): Embedding(13317, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (6): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (7): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (8): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (9): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (10): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (11): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (12): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (13): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (14): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (15): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (16): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (17): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (18): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (19): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (20): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (21): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (22): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (23): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) )
    token_type_ids
    None
  • /usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__
    1. if result is not None:
    2. if not isinstance(result, tuple):
    3. result = (result,)
    4. input = result
    5. if torch._C._get_tracing_state():
    6. result = self._slow_forward(*input, **kwargs)
    7. else:
    1. result = self.forward(*input, **kwargs) ...
    1. for hook in self._forward_hooks.values():
    2. hook_result = hook(self, input, result)
    3. if hook_result is not None:
    4. result = hook_result
    5. if len(self._backward_hooks) > 0:
    6. var = result
    VariableValue
    input
    (tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64),)
    kwargs
    {'attention_mask': None, 'head_mask': None, 'past': None, 'position_ids': None, 'token_type_ids': None}
    self
    GPT2Model( (wte): Embedding(13317, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (6): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (7): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (8): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (9): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (10): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (11): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (12): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (13): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (14): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (15): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (16): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (17): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (18): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (19): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (20): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (21): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (22): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (23): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) )
  • /usr/local/lib/python3.6/dist-packages/transformers/modeling_gpt2.py in forward
    1. def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
    2. labels=None):
    3. transformer_outputs = self.transformer(input_ids,
    4. past=past,
    5. attention_mask=attention_mask,
    6. token_type_ids=token_type_ids,
    7. position_ids=position_ids,
    1. head_mask=head_mask) ...
    1. hidden_states = transformer_outputs[0]
    2. lm_logits = self.lm_head(hidden_states)
    3. outputs = (lm_logits,) + transformer_outputs[1:]
    4. if labels is not None:
    VariableValue
    attention_mask
    None
    head_mask
    None
    input_ids
    tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64)
    labels
    None
    past
    None
    position_ids
    None
    self
    GPT2LMHeadModel( (transformer): GPT2Model( (wte): Embedding(13317, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (6): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (7): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (8): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (9): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (10): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (11): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (12): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (13): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (14): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (15): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (16): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (17): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (18): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (19): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (20): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (21): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (22): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (23): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (lm_head): Linear(in_features=768, out_features=13317, bias=False) )
    token_type_ids
    None
  • /usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__
    1. if result is not None:
    2. if not isinstance(result, tuple):
    3. result = (result,)
    4. input = result
    5. if torch._C._get_tracing_state():
    6. result = self._slow_forward(*input, **kwargs)
    7. else:
    1. result = self.forward(*input, **kwargs) ...
    1. for hook in self._forward_hooks.values():
    2. hook_result = hook(self, input, result)
    3. if hook_result is not None:
    4. result = hook_result
    5. if len(self._backward_hooks) > 0:
    6. var = result
    VariableValue
    input
    ()
    kwargs
    {'input_ids': tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64)}
    self
    GPT2LMHeadModel( (transformer): GPT2Model( (wte): Embedding(13317, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (6): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (7): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (8): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (9): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (10): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (11): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (12): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (13): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (14): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (15): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (16): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (17): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (18): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (19): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (20): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (21): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (22): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) (23): Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): MLP( (c_fc): Conv1D() (c_proj): Conv1D() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (lm_head): Linear(in_features=768, out_features=13317, bias=False) )
  • /root/gptc/generate_api_gpu_one_sen.py in sample_sequence
    1. context = torch.tensor(context, dtype=torch.long, device=device)
    2. context = context.unsqueeze(0)
    3. generated = context
    4. with torch.no_grad():
    5. for _ in trange(length):
    6. inputs = {'input_ids': generated[0][-(n_ctx - 1):].unsqueeze(0)}
    7. 你可能喜欢的: