# Notebook setup (Colab): install the Kaggle CLI, import dependencies, and
# mount Google Drive. `!pip` is an IPython shell magic — this file only runs
# inside a notebook environment.
!pip install -q kaggle
from google.colab import drive, files
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile
from sklearn import model_selection
drive.mount('/content/gdrive')
import os
import pathlib
# Upload the API token.
def get_kaggle():
    """Return the imported ``kaggle`` module, prompting for an API token if needed.

    The ``kaggle`` package raises OSError at import time when no credentials
    file exists; in that case we ask the Colab user to upload ``kaggle.json``,
    install it under ``~/.kaggle/`` with owner-only permissions, and retry.

    Raises:
      ValueError: when not running in Colab, or when the upload does not
        contain a file named ``kaggle.json``.
    """
    try:
        import kaggle
    except OSError:
        # No token on disk yet — fall through to the upload flow below.
        pass
    else:
        return kaggle

    token_file = pathlib.Path("~/.kaggle/kaggle.json").expanduser()
    token_file.parent.mkdir(exist_ok=True, parents=True)
    try:
        from google.colab import files
    except ImportError:
        raise ValueError("Could not find kaggle token.")
    uploaded = files.upload()
    token_content = uploaded.get('kaggle.json', None)
    if not token_content:
        raise ValueError('Need a file named "kaggle.json"')
    token_file.write_bytes(token_content)
    # Kaggle's client refuses world-readable credentials.
    token_file.chmod(0o600)
    import kaggle
    return kaggle
# Module-level handle used by get_data() below.
kaggle = get_kaggle()
# Download data from Kaggle and create a DataFrame.
def load_data_from_zip(path):
    """Return a DataFrame parsed from the first member of the zip at ``path``."""
    archive = zipfile.ZipFile(path, "r")
    try:
        # The archives here each contain exactly one CSV, so take member 0.
        member_name = archive.namelist()[0]
        with archive.open(member_name) as csv_stream:
            frame = pd.read_csv(csv_stream)
    finally:
        archive.close()
    return frame
# The data does not come with a validation set so we'll create one from the
# training set.
def get_data(competition, train_file, test_file, validation_set_ratio=0.1):
    """Download a Kaggle competition's data and return train/val/test frames.

    Args:
      competition: Kaggle competition slug to download.
      train_file: name of the zipped training CSV inside the archive.
      test_file: name of the zipped test CSV inside the archive.
      validation_set_ratio: fraction of training rows held out for validation.

    Returns:
      (train_df, validation_df, test_df) pandas DataFrames.
    """
    data_path = pathlib.Path("data")
    kaggle.api.competition_download_files(competition, data_path)
    competition_path = data_path / competition
    competition_path.mkdir(exist_ok=True, parents=True)
    competition_zip_path = competition_path.with_suffix(".zip")
    with zipfile.ZipFile(competition_zip_path, "r") as zip_ref:
        zip_ref.extractall(competition_path)
    train_df = load_data_from_zip(competition_path / train_file)
    test_df = load_data_from_zip(competition_path / test_file)
    # Hold out a validation set from the training rows. Bug fix: the
    # validation_set_ratio parameter was previously ignored in favor of a
    # hard-coded test_size=0.3. (This is a plain row split — nothing here
    # groups phrases by sentence id.)
    train_df, validation_df = model_selection.train_test_split(
        train_df,
        test_size=validation_set_ratio,
        random_state=0)
    print("Split the training data into %d training and %d validation examples." %
          (len(train_df), len(validation_df)))
    return train_df, validation_df, test_df
# Fetch the Jigsaw toxic-comment data; get_data() also carves a validation
# split out of the training rows.
train_df, validation_df, test_df = get_data(
"jigsaw-toxic-comment-classification-challenge",
"train.csv.zip", "test.csv.zip")
# Peek at the first rows of each frame (notebook-style inspection).
test_df.head(5)
train_df.head(5)
# Per-label value counts (0 vs 1) for each of the six toxicity columns,
# joined on the count index and shown as a grouped bar chart.
# Bug fix: the original had a stray trailing backslash after the last
# .join(...), which line-continued the plot call into the expression and
# produced a SyntaxError.
# NOTE(review): this join relies on each value_counts() frame having a
# distinct column name — confirm against the installed pandas version.
graph_df = (
    train_df['toxic'].value_counts().to_frame()
    .join(train_df['severe_toxic'].value_counts().to_frame())
    .join(train_df['obscene'].value_counts().to_frame())
    .join(train_df['threat'].value_counts().to_frame())
    .join(train_df['insult'].value_counts().to_frame())
    .join(train_df['identity_hate'].value_counts().to_frame())
)
graph_df.plot(kind='bar', figsize=(12, 6))
# Features are the raw comment strings; labels are the six toxicity columns
# (everything after the 'id' and 'comment_text' columns).
x_train = train_df['comment_text']
y_train = train_df.iloc[:, 2:]
x_val = validation_df['comment_text']
y_val = validation_df.iloc[:, 2:]
x_train.head()
y_train.head()
# Removed a stray bare expression (`tf.keras.losses.BinaryCrossentropy`)
# that evaluated the class object and discarded it — a no-op.
# The Universal Sentence Encoder encodes text into high-dimensional vectors
# that can be used for text classification, semantic similarity, clustering,
# and other natural language tasks. The pre-trained Universal Sentence Encoder
# is publicly available in TensorFlow Hub.
class MyModel(tf.keras.Model):
    """Toxicity classifier: a TF-Hub text embedding feeding a dense head."""

    def __init__(self, hub_url):
        super().__init__()
        self.hub_url = hub_url
        self.embed = hub.load(self.hub_url)
        # Classification head: two hidden layers, then six sigmoid outputs —
        # one independent probability per toxicity label.
        self.sequential = tf.keras.Sequential([
            tf.keras.layers.Dense(500),
            tf.keras.layers.Dense(100),
            tf.keras.layers.Dense(6),
            tf.keras.layers.Activation('sigmoid'),
        ])

    def call(self, inputs):
        flat_inputs = tf.reshape(inputs, shape=[-1])
        # NOTE(review): the 6x scaling of the embedding looks like a leftover
        # experiment — confirm it is intentional before changing it.
        scaled_embedding = 6 * self.embed(flat_inputs)
        return self.sequential(scaled_embedding)

    def get_config(self):
        # Enables serialization/reconstruction from the hub URL alone.
        return {"hub_url": self.hub_url}
model = MyModel("https://tfhub.dev/google/nnlm-en-dim128/2")
# Bug fix: the model ends in a sigmoid Activation, so its outputs are already
# probabilities, not logits. The previous from_logits=True made the loss apply
# a second sigmoid on top of the model's own, distorting training.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.optimizers.Adam(),
    metrics=[tf.keras.metrics.BinaryCrossentropy()])
history = model.fit(x=x_train, y=y_train,
                    validation_data=(x_val, y_val),
                    epochs=25)
# Inspect sample test comments (notebook-style bare expressions).
test_df['comment_text'][1]
test_df['comment_text']
# Predict probabilities for the six toxicity labels on the full test set.
y_test = model.predict(test_df['comment_text'])
y_test
# Assemble the Kaggle submission: one row per test id, one column per label.
result_df = pd.DataFrame(y_test, columns=y_train.columns)
result_df
test_df['id']
result_df.insert(0, 'id', test_df['id'])
result_df
result_df.to_csv("submission.csv", index=False)
len(y_test)
# Spot-check the model on an obviously toxic and an obviously benign sentence.
model.predict(["Mother fucker"])
model.predict(["Starting a new AI project there are lots of things to consider, and getting a proof-of-concept going with whatever-bunch-of-tools-you-come-across is, probably, the way to go. But once this phase of the project is over you’ll need to think engineering!"])
# NOTE(review): saving a subclassed Model's weights to HDF5 requires the model
# to have been built (it was, by fit() above) — confirm format support for
# subclassed models on the installed TF version.
model.save_weights('my_model.h5')