
ChatGPT Text Similarity Comparison




1 Similarity comparison using BERT
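
The example below fine-tunes a pre-trained BERT model on the STS-B (Semantic Textual Similarity Benchmark) sentence pairs and then compares two documents by computing the cosine similarity between their pooled BERT embeddings.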

!pip install transformers

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import pairwise_distances

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pre-trained BERT model and tokenizer.
# STS-B is a regression task (similarity scores from 0 to 5), so use a single regression head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1).to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the pre-tokenized STS-B training tensors in a DataLoader.
# train_input_ids, train_attention_masks and train_labels are assumed to have been
# prepared from the STS-B sentence pairs (see the data-preparation sketch below).
train_data = TensorDataset(torch.tensor(train_input_ids),
                           torch.tensor(train_attention_masks),
                           torch.tensor(train_labels, dtype=torch.float))
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)

# Fine-tune the BERT model on the STS-B dataset
optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()
for epoch in range(3):
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, attention_masks, labels = (t.to(device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
        outputs.loss.backward()
        optimizer.step()

# Compare the similarity between two text documents using the fine-tuned BERT model.
# Each document is encoded separately so that each one gets its own pooled embedding.
document1 = "The cat is sitting on the mat"
document2 = "The dog is lying on the rug"
inputs = tokenizer([document1, document2], padding=True, truncation=True, return_tensors='pt')
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

model.eval()
with torch.no_grad():
    # Pooled [CLS] representation of each document from the underlying BERT encoder
    embeddings = model.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
embeddings = embeddings.cpu().numpy()

# Cosine distance between the two document embeddings; similarity = 1 - distance
similarity = pairwise_distances(embeddings, metric='cosine')[0, 1]
print("Similarity score:", 1 - similarity)

2 Similarity comparison using GPT-2
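
GPT-2 has no pooled sentence output of its own, so the example below mean-pools the last hidden states of each document's tokens into a single vector and compares the two vectors with cosine similarity. No fine-tuning is done here; the pre-trained model is used as-is.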

!pip install transformers

import torch
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained GPT-2 model and tokenizer
model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model.eval()

# Encode the two text documents separately so each one gets its own embedding
document1 = "The cat is sitting on the mat"
document2 = "The dog is lying on the rug"
inputs1 = tokenizer(document1, return_tensors='pt')
inputs2 = tokenizer(document2, return_tensors='pt')

# Generate an embedding for each document by mean-pooling GPT-2's last hidden states
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)
embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()

# Calculate the cosine similarity between the two document embeddings
similarity = cosine_similarity([embedding1], [embedding2])
print("Similarity score:", similarity[0][0])