This repository was archived by the owner on Oct 31, 2023. It is now read-only.

Same sentence, different encoding! #141

@MarcusNerva

```python
import sys
sys.path.append('../')
import os
import math

import numpy as np
import torch

from infersent_model import InferSent

EPS = 1e-4

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

if __name__ == '__main__':
    # opt = myopts.parse_opt()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the pretrained InferSent encoder.
    model_version = 1
    MODEL_PATH = './encoder'
    assert MODEL_PATH is not None, '--infersent_model_path is None!'
    MODEL_PATH = os.path.join(MODEL_PATH, 'infersent%s.pkl' % model_version)
    params_model = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': model_version
    }
    infersent_model = InferSent(params_model)
    infersent_model.load_state_dict(torch.load(MODEL_PATH))
    infersent_model = infersent_model.to(device)

    # Build the vocabulary from GloVe vectors.
    W2V_PATH = './Glove/glove.840B.300d.txt'
    assert W2V_PATH is not None, '--w2v_path is None!'
    infersent_model.set_w2v_path(W2V_PATH)
    infersent_model.build_vocab_k_words(K=100000)

    store = ['a man is talking about a movie pictures of a movie pictures',
             'a person is folding paper',
             'a man is singing',
             'people are dancing and dancing',
             'a man and woman are talking about something',
             'a woman is applying makeup',
             'a person is cooking a dish and adding ingredients into a pot',
             'a man is talking',
             'a man is talking about the weather on the screen',
             'cartoon characters are interacting']

    # Encode all sentences together in a single batch.
    embeddings = infersent_model.encode(store, bsize=128, tokenize=True)

    for i in range(len(store)):
        # Encode the i-th sentence alone.
        temp = infersent_model.encode([store[i]], bsize=128, tokenize=True)[0]
        # Compare the embedding of the i-th sentence encoded alone with its
        # embedding from the batch encoding; if the encoder were
        # batch-invariant, the cosine similarity would be exactly 1.
        if math.fabs(1 - cosine(temp, embeddings[i])) > EPS:
            print(cosine(temp, embeddings[i]))
```
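
For what it's worth, a module restored with `load_state_dict` stays in training mode by default. With `dpout_model` set to 0.0 dropout should be inactive either way, but calling `eval()` before encoding rules it out as a source of randomness; a minimal sanity check:

```python
# Sanity check: put the encoder in eval mode so any training-mode
# behaviour (e.g. dropout) is disabled before encoding.
infersent_model.eval()
embeddings = infersent_model.encode(store, bsize=128, tokenize=True)
```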

and here is the output:

```
Vocab size : 100000
0.9066778
0.87379414
0.89509517
0.9344797
0.9010086
0.8247624
0.9670602
0.9080478
```

Really weird, isn't it?
Since all the parameters are frozen, how can the same sentence end up with different embeddings?
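
One mechanism that could produce this in principle (just a sketch of the general padding effect, not necessarily what InferSent's `encode` actually does internally): if variable-length sentences are padded to the batch's maximum length and the padded time steps are not masked out, the max-pooled representation changes with the batch composition. A minimal standalone demonstration with a plain LSTM:

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
lstm = nn.LSTM(input_size=4, hidden_size=4, batch_first=True)

x = torch.randn(1, 3, 4)  # one "sentence" of 3 time steps

# Encoded alone: max-pool over the 3 real steps.
out_alone, _ = lstm(x)
pooled_alone = out_alone.max(dim=1).values

# Padded to length 6, as if batched with a longer sentence, without
# packing/masking: the zero-padded steps still produce LSTM outputs
# and participate in the max-pool.
x_padded = torch.cat([x, torch.zeros(1, 3, 4)], dim=1)
out_padded, _ = lstm(x_padded)
pooled_padded = out_padded.max(dim=1).values

print(torch.allclose(pooled_alone, pooled_padded))  # typically False
```

Whether that is what happens here depends on how `encode` batches, sorts, and pads sentences internally, so take it only as a hypothesis.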
