728x90

2024-01-09 78th Class

Word2Vec Modeling

모델 저장 및 불러오기

# Load the previously saved vectors from the word2vec-format file.
from gensim.models import KeyedVectors

loaded_model = KeyedVectors.load_word2vec_format('Word2Vec.txt')

# Look up the entries most similar to the query word and print them
# (the output below shows them as (word, similarity) pairs).
neighbours = loaded_model.most_similar('노잼')
print(neighbours)

'''
[('핵', 0.7714633941650391), ('노답', 0.7687075138092041), ('깜놀', 0.743700385093689), ('꿀잼', 0.7122993469238281), ('개핵', 0.707148551940918), ('개꿀잼', 0.6943026185035706), ('우와', 0.6871969103813171), ('레알', 0.6695948243141174), ('급전', 0.6631433367729187), ('ㅋㅋㅋㅋㅋㅋㅋ', 0.6627783179283142)]
'''

vocab 저장 & 단어 벡터 저장

# Collect every key of the model's word -> index mapping as a plain list.
vocab = list(loaded_model.key_to_index.keys())

# Index the model with the whole list to get one vector per word.
word_vectors = loaded_model[vocab]

# Shape is (vocab size, embedding size).
word_vectors.shape
# Output: (16470, 100)

word vector 차원 축소

from sklearn.decomposition import PCA
# Reduce each word vector to 2 components so the words can be drawn on a 2-D plot.
pca = PCA(n_components=2)
# Fit the PCA on the word vectors and project them in a single call.
X = pca.fit_transform(word_vectors)
# One row per word, two columns (the two components).
X.shape
# Output: (16470, 2)

시각화

import pandas as pd

# Table with the two PCA components per row plus the word each row belongs to.
df = pd.DataFrame(X, columns=['com1', 'com2']).assign(word=vocab)

# Preview the first rows.
df.head()
구분 com1 com2 word
0 2.281575 0.286518 영화
1 -0.861905 1.258804 보다
2 4.666112 3.470907
3 3.230385 -0.334965 없다
4 0.975042 -0.108117 이다
import plotly.express as px

# Plot only the first 1000 rows (presumably to keep the labelled plot readable).
plot_df = df.head(1000)

# Scatter of the two components, each point labelled with its word.
fig = px.scatter(plot_df, x='com1', y='com2', text='word')

# Put each text label above its marker.
fig.update_traces(textposition='top center')
fig.show()

newplot.png

반응형