대상 문단
‘A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40% fewer deaths by the start of June. Containment policies had a large impact on the number of COVID-19 cases and deaths, directly by reducing transmission rates and indirectly by constraining people’s behaviour. They account for roughly half the observed change in the growth rates of cases and deaths.’
질의 응답
from transformers import pipeline
# Extractive question-answering pipeline. The same SQuAD-finetuned BERT
# checkpoint is used for both the model weights and the tokenizer.
qNa = pipeline('question-answering',
               model='bert-large-cased-whole-word-masking-finetuned-squad',
               tokenizer='bert-large-cased-whole-word-masking-finetuned-squad')
# Context paragraph that both questions below are answered from.
paragraph = 'A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40% fewer deaths by the start of June. Containment policies had a large impact on the number of COVID-19 cases and deaths, directly by reducing transmission rates and indirectly by constraining people’s behaviour. They account for roughly half the observed change in the growth rates of cases and deaths.'

# Question 1: which country the paragraph is about.
ans = qNa({'question': 'Which country is this article about?',
           'context': paragraph})
print(ans)
# {'score': 0.795813262462616, 'start': 34, 'end': 36, 'answer': 'US'}

# Question 2: which disease the paragraph discusses. `ans` is overwritten, so
# only this last answer remains available afterwards.
ans = qNa({'question': 'Which disease is discussed in this article?',
           'context': paragraph})
print(ans)
# {'score': 0.9766002893447876, 'start': 205, 'end': 213, 'answer': 'COVID-19'}
# https://stackoverflow.com/questions/53424164/color-highlighting-text-in-r-for-a-pre-defined-list-of-words
library(reticulate)
library(tidyverse)
library(crayon)
# R-side copy of the paragraph whose words are colour-highlighted below.
paragraph <- "A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40% fewer deaths by the start of June. Containment policies had a large impact on the number of COVID-19 cases and deaths, directly by reducing transmission rates and indirectly by constraining people’s behaviour. They account for roughly half the observed change in the growth rates of cases and deaths."
#' Split each string on single spaces and keep the distinct words
#'
#' @param x A character vector (or list of single strings).
#' @return A list with one element per element of `x`; each element is the
#'   character vector of unique space-separated tokens, in order of first
#'   appearance.
unique_words <- function(x) {
  # The original passed `collapse = " "` to unique(), which silently ignores
  # it, so the argument is dropped here without changing the result.
  lapply(x, function(s) unique(base::strsplit(x = s, split = " ")[[1]]))
}
# One row per unique word of `paragraph`, with the two answer terms wrapped in
# crayon colour codes ("US" in red, "COVID-19" in blue).
df <- tibble::enframe(unique_words(x = paragraph)) %>%
  # `value` is the list-column produced by enframe(); name it explicitly
  # because unnest() without `cols` is deprecated.
  tidyr::unnest(cols = value) %>%
  dplyr::mutate(
    # value2 = dplyr::case_when(value == py$ans$answer ~ crayon::red(value),
    value2 = dplyr::case_when(
      value == "US" ~ crayon::red(value),
      value == "COVID-19" ~ crayon::blue(value),
      TRUE ~ value
    )
  ) %>%
  dplyr::select(-value)

# cat() writes the words and returns NULL; the outer print() then prints that
# NULL, which is the trailing "NULL" in the recorded output.
print(cat(df$value2))
A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40% fewer deaths by start of June. Containment policies a large impact number COVID-19 cases and deaths, directly reducing transmission rates indirectly constraining people’s behaviour. They account for roughly half observed change in growth deaths.NULL
# Masked-language-model pipeline; model and tokenizer come from the same
# checkpoint.
fill_mask = pipeline("fill-mask",
                     model="bert-base-cased",
                     tokenizer="bert-base-cased")

# Candidate fillers for the [MASK] position in the first sentence.
mask_output = fill_mask("A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40% fewer [MASK] by the start of June")
mask_output
# [{'score': 0.19625626504421234, 'token': 6209, 'token_str': 'deaths', 'sequence': 'A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40 % fewer deaths by the start of June'}, {'score': 0.11479492485523224, 'token': 26107, 'token_str': 'executions', 'sequence': 'A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40 % fewer executions by the start of June'}, {'score': 0.08466506004333496, 'token': 5256, 'token_str': 'victims', 'sequence': 'A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40 % fewer victims by the start of June'}, {'score': 0.04194879159331322, 'token': 17944, 'token_str': 'masks', 'sequence': 'A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40 % fewer masks by the start of June'}, {'score': 0.027420325204730034, 'token': 19189, 'token_str': 'arrests', 'sequence': 'A new study estimates that if the US had universally mandated masks on 1 April, there could have been nearly 40 % fewer arrests by the start of June'}]
library(tidyverse)
library(reticulate)
library(reactable)
# mask_tbl <- map_df(py$mask_output, bind_rows)
#
# mask_tbl %>%
# write_rds("data/fill_mask.rds")
# Load the cached fill-mask results (written by the commented-out code above).
mask_tbl <- read_rds("data/fill_mask.rds")

# Predicted filler tokens; used below to decide which words to colour.
tokens <- mask_tbl$token_str
# Rebuild each candidate sentence word by word, colouring every word that is
# one of the predicted filler tokens.
mask_color_tbl <- mask_tbl %>%
  tidytext::unnest_tokens(word, sequence) %>%
  dplyr::mutate(
    text = dplyr::case_when(
      word %in% tokens ~ crayon::red(word),
      TRUE ~ word
    )
  ) %>%
  group_by(score, token, token_str) %>%
  # Drop the grouping so the result is a plain tibble rather than one still
  # grouped by score and token.
  summarize(text = str_c(text, collapse = " "), .groups = "drop")
# Render the coloured sentences as an interactive table.
mask_color_tbl %>%
  reactable::reactable(
    defaultColDef = colDef(
      # Show dots in column names as spaces in the header.
      header = function(value) gsub(".", " ", value, fixed = TRUE),
      cell = function(value) format(value, nsmall = 1),
      align = "center",
      minWidth = 70,
      headerStyle = list(background = "#f7f7f8")
    ),
    columns = list(
      score = colDef(minWidth = 50),
      token = colDef(minWidth = 50),
      token_str = colDef(minWidth = 50),
      text = colDef(minWidth = 140)
    ),
    bordered = TRUE,
    highlight = TRUE
  )

# cat() returns NULL, so the outer print() adds the trailing "NULL" seen in
# the recorded output.
print(cat(paste0(mask_color_tbl$text, "\n\n")))
a new study estimates that if the us had universally mandated masks on 1 april there could have been nearly 40 fewer arrests by the start of june
a new study estimates that if the us had universally mandated masks on 1 april there could have been nearly 40 fewer masks by the start of june
a new study estimates that if the us had universally mandated masks on 1 april there could have been nearly 40 fewer victims by the start of june
a new study estimates that if the us had universally mandated masks on 1 april there could have been nearly 40 fewer executions by the start of june
a new study estimates that if the us had universally mandated masks on 1 april there could have been nearly 40 fewer deaths by the start of june
NULL
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import sklearn
import re
import string
import seaborn as sns
# Load the training tweets and drop the columns that are not used below.
train_data = pd.read_csv('data/twitter/train.csv')
train_data.drop(['id', 'keyword', 'location'], axis=1, inplace=True)
train_data.head()

# Features are everything except the label column; labels are `target`.
X = train_data.drop('target', axis=1)
y = train_data['target']
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): `corpus` is not defined in the visible part of this file;
# presumably it is the list of (cleaned) tweet texts aligned with `y` -- confirm.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=42)
from transformers import DistilBertTokenizerFast
# Tokenizer matching the 'distilbert-base-uncased' checkpoint fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
def tokenize_text(data):
    """Encode ``data`` with the module-level ``tokenizer``.

    The inputs are padded and truncated, and the tokenizer is asked for NumPy
    output (``return_tensors='np'``).

    Args:
        data: Text input accepted by ``tokenizer`` (called below with the
            train/test text splits).

    Returns:
        The ``.data`` attribute of the tokenizer's output.
    """
    encoded = tokenizer(data, padding=True, truncation=True, return_tensors='np')
    return encoded.data
# Encode both splits. Note that `train_data` is rebound here from the raw
# DataFrame to the encoded training inputs.
train_data = tokenize_text(X_train)
test_data = tokenize_text(X_test)
from transformers import TFDistilBertForSequenceClassification
# DistilBERT with a sequence-classification head configured for two labels.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                              num_labels=2)
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# from_logits=True tells the loss to treat the model output as raw logits.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
# Fine-tune for one epoch. The held-out split doubles as validation data, so
# the validation metrics and the later evaluation use the same examples.
model.fit(
    train_data,
    np.array(y_train),
    validation_data=(
        test_data,
        np.array(y_test),
    ),
    batch_size=32,
    epochs=1,
)

# Persist the fine-tuned weights so they can be reloaded without retraining.
model.save_pretrained('data/twitter_bert_80')
from transformers import TFDistilBertForSequenceClassification
# Reload the saved fine-tuned model and score the held-out split.
tf_model = TFDistilBertForSequenceClassification.from_pretrained('data/twitter_bert_80')
preds = tf_model.predict(test_data)
# 48/48 [==============================] - 44s 829ms/step

# Predicted class = index of the largest logit in each row.
classes = np.argmax(preds['logits'], axis=1)
from sklearn import metrics
# scikit-learn metrics take (y_true, y_pred). Passing the true labels first
# makes the confusion-matrix rows the true classes and the columns the
# predictions; accuracy is unaffected by the order.
metrics.accuracy_score(y_test, classes)
# 0.8023637557452397
metrics.confusion_matrix(y_test, classes)
# Counts from the recorded run, laid out as rows = true, columns = predicted:
# array([[691, 183],
#        [118, 531]], dtype=int64)