import os
# Switch the working directory to the backend root. The relative "data/..."
# paths used below are resolved against it; presumably the "modules" package
# import relies on it as well — TODO confirm.
os.chdir('/home/dsuser/workspace/backend')


import joblib
from modules.model.text_classify import *


# Iteration tag and dataset name together select which saved artifacts to load.
itr = "00"
# Alternative dataset (disabled):
# dataset_name = "aozora"
dataset_name = "ldcc"
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
pipe = joblib.load(f"data/model/pipe-jptokenizermecab_{dataset_name}set_iter{itr}.gz")
pipe  # bare expression: the notebook displays the loaded pipeline

Pipeline(steps=[('tokenizer',
                 <modules.model.text_classify.JpTokenizerMeCab object at 0x7f6043b77130>),
                ('vectorizer',
                 TfidfVectorizer(lowercase=False,
                                 tokenizer=<function ident_tokener at 0x7f6044121a60>)),
                ('to_dence',
                 <modules.model.text_classify.SparsetoDense object at 0x7f5fdaf46e50>),
                ('embedder',
                 FeatureUnion(transformer_list=[('pca', PCA(n_components=16)),
                                                ('identity',
                                                 <modules.model.text_classify.Transer object at 0x7f5fda94a160>)])),
                ('classifier',
                 LGBMClassifier(importance_type='gain', num_class=9,
                                objective='softmax'))])


# Load the dataset object saved under the same dataset name / iteration tag
# as the pipeline above.
dataset = joblib.load(f"data/dataset/{dataset_name}set_iter{itr}.gz")


# With do_split=True both accessors return a (train, valid) pair.
X_train, X_valid = dataset.get_data(do_split=True)
y_train, y_valid = dataset.get_labels(do_split=True)
print("X_train:", len(X_train), "records")
print("X_valid:", len(X_valid), "records")

X_train: 5156 records
X_valid: 2211 records


# Peek at the first training record.
print("X_train sample:")
X_train[0]

X_train sample:

['大津いじめ問題、加害者少年の複雑家庭事情もネットは厳しい反応',
 '24日、Web版「女性自身」では、「大津中２いじめ自殺\u3000加害少年Bの家庭崩壊『離婚します』と母」と題した記事で、加害者とされる同級生グループのうち、一人の家が「家庭崩壊」状態であることを報じた。',
 '',
 '記事では、京都府宇治市の学校に転校したという加害者少年の家庭事情が詳細に書かれている。少年の父母は小学校に入る前に離婚してしまったが、二年ほど前に再婚していたという。しかし、同誌は、その頃から加害者の少年が荒れ始めるようになったと報じている。',
 '',
 'また、同級生の少年が自殺した後、加害者と母親は京都府宇治市に転居した上、再婚した母親は再び離婚し、現在は「家庭崩壊」状態にあるとしている。',
 '',
 'だが、ネット掲示板では「家庭不和だからといって、他人をいじめていいわけではない」「母子家庭など、日本に山ほどいる。言い訳にはならない」といった厳しい反応が多数を占めた。',
 '',
 '【関連記事】',
 '・加害者の少年達は他にもいじめをしていた／「週刊文春」報じる',
 '・越直美大津市長の謝罪受け入れへ\u3000遺族と２５日に初めて面会',
 '・埼玉・草加「飛び降り強要」被害生徒母親の保護要請に校長「うっかりしてた」',
 '・大津いじめ\u3000昨秋体育大会で「ミイラのようにぐるぐる巻き」粘着テープと在校生徒が証言',
 '',
 '【関連情報】',
 '・大津中２いじめ自殺\u3000加害少年Bの家庭崩壊「離婚します」と母（女性自身）']


# Predict on the training set and report its accuracy.
# NOTE(review): accuracy_score is not imported explicitly in this file;
# presumably it arrives via the star import of modules.model.text_classify
# — TODO confirm.
p_train = pipe.predict(X_train)
train_acc = accuracy_score(y_train, p_train)
print("train_acc:", train_acc)

train_acc: 1.0


# Predict class probabilities for the validation set and show the first row.
prob_valid = pipe.predict_proba(X_valid)
print("prob_valid sample:", prob_valid[0])

prob_valid sample: [1.99797443e-07 3.32973509e-07 1.03544772e-07 4.32056982e-07
 7.52664464e-06 3.33991948e-07 7.83590579e-08 9.99990417e-01
 5.75763947e-07]


# The last pipeline step is the classifier.
lgbm = pipe[-1]
# Take the most probable column per row and map the column indices back to
# label strings through the classifier's private label encoder (_le).
p_valid = lgbm._le.inverse_transform(prob_valid.argmax(axis=1))
valid_acc = accuracy_score(y_valid, p_valid)
print("valid_acc:", valid_acc)

valid_acc: 0.9516056083220262


# Labels in the order used by the classifier's internal label encoder
# (index i -> label), i.e. the column order of predict_proba.
lgbm._le.inverse_transform(range(len(dataset.labelset)))

array(['dokujo-tsushin', 'it-life-hack', 'kaden-channel',
       'livedoor-homme', 'movie-enter', 'peachy', 'smax', 'sports-watch',
       'topic-news'], dtype='<U14')


# Label collection held by the dataset instance, shown for comparison with the
# encoder's label order.
dataset.labelset

{'dokujo-tsushin',
 'it-life-hack',
 'kaden-channel',
 'livedoor-homme',
 'movie-enter',
 'peachy',
 'smax',
 'sports-watch',
 'topic-news'}


import numpy
from sklearn import metrics


from matplotlib import pyplot

def plot_pr_curve_by_label(ax, lbl: str, t: numpy.ndarray, p: numpy.ndarray):
    """Draw the precision-recall curve of one label on ``ax``.

    The subplot title reports the average precision (AP) and the ROC AUC
    computed from the same targets and scores.

    Args:
        ax: Matplotlib axes to draw on.
        lbl: Label name shown in the subplot title.
        t: Binary targets for this label.
        p: Scores for this label, aligned with ``t``.
    """
    # Summary numbers for the title.
    ap = metrics.average_precision_score(t, p)
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(t, p)
    auc = metrics.auc(false_pos_rate, true_pos_rate)

    # The curve itself, drawn as a filled step plot.
    precision, recall, _ = metrics.precision_recall_curve(t, p)
    ax.step(recall, precision, color='g', alpha=0.2, where='post')
    ax.fill_between(recall, precision, step='post', alpha=0.2, color='g')

    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_xlim([0.0, 1.05])
    ax.set_ylim([0.0, 1.0])
    ax.set_title(f'{lbl} : AP={ap:0.3f} / AUC={auc:0.3f}')


def plot_pr_curve(dataset, pipe, label_plotter):
    """Draw one per-label diagnostic plot for every class on a grid of subplots.

    For each label in ``dataset.labelset`` a one-vs-rest binary target is built
    from the module-level ``y_valid`` and the matching probability column is
    taken from ``pipe.predict_proba`` evaluated on the module-level ``X_valid``.
    Both are handed to ``label_plotter`` together with the target axes.

    Args:
        dataset: Object exposing ``labelset``, the collection of class labels.
        pipe: Fitted pipeline whose last step is the classifier; that step's
            ``_le`` label encoder maps a label to its probability column.
        label_plotter: Callable ``(ax, lbl, t, p)`` that draws a single subplot.

    Raises:
        ValueError: If ``dataset.labelset`` is empty.
    """
    n_classes = len(dataset.labelset)
    if n_classes == 0:
        raise ValueError("dataset.labelset is empty; nothing to plot")

    # Near-square grid: floor(sqrt(n)) rows and just enough columns to hold
    # every class (ceiling division).
    n_rows = int(numpy.sqrt(n_classes))
    print("n_rows:", n_rows)
    n_cols = -(-n_classes // n_rows)

    fig, axes = pyplot.subplots(
        nrows=n_rows,
        ncols=n_cols,
        figsize=(10, 10),
        squeeze=False,
        tight_layout=True,
    )
    fig.suptitle('PR-Curves', fontsize=16)

    lgbm = pipe[-1]
    # Loop-invariant work: run the (expensive) pipeline once, not once per label.
    y_true = numpy.array(y_valid)
    proba = pipe.predict_proba(X_valid)

    for idx, lbl in enumerate(dataset.labelset):
        # Column of this label in the classifier's probability output.
        idx_lbl = lgbm._le.transform([lbl])

        t = (y_true == lbl).astype(numpy.int32)
        p = proba[:, idx_lbl]

        r, c = divmod(idx, n_cols)
        label_plotter(axes[r, c], lbl, t, p)

    # show plots
    fig.tight_layout()
    fig.show()


# Render the precision-recall curve grid, one subplot per label.
plot_pr_curve(dataset, pipe, label_plotter=plot_pr_curve_by_label)

n_rows: 3


def plot_pr_threshold_by_label(ax, lbl: str, t: numpy.ndarray, p: numpy.ndarray):
    """Plot precision and recall of one label against the decision threshold.

    Args:
        ax: Matplotlib axes to draw on.
        lbl: Label name used as the subplot title.
        t: Binary targets for this label.
        p: Scores for this label, aligned with ``t``.
    """
    precision, recall, thresholds = metrics.precision_recall_curve(t, p)

    # The final precision/recall entry is dropped so both series line up with
    # ``thresholds`` on the x axis.
    series = (
        ("Precision", precision[:-1], 'b'),
        ("Recall", recall[:-1], 'r'),
    )
    for name, values, color in series:
        # Only the step line carries a label, so the legend lists exactly one
        # entry per metric regardless of the order in which artists were added.
        ax.step(thresholds, values, color=color, alpha=0.2, where='post', label=name)
        ax.fill_between(thresholds, values, step='post', alpha=0.2, color=color)

    ax.legend(loc='lower right')

    ax.set_xlabel('Threshold')
    ax.set_ylabel('Precision/Recall')
    ax.set_xlim([0.0, 1.05])
    ax.set_ylim([0.0, 1.0])
    ax.set_title(f'{lbl}')


# Render precision and recall against the threshold, one subplot per label.
plot_pr_curve(dataset, pipe, label_plotter=plot_pr_threshold_by_label)

n_rows: 3


# The first pipeline step is the tokenizer; keep a handle to it for the LIME
# helper functions defined further down.
tokenizer = pipe[0]
tokenizer

<modules.model.text_classify.JpTokenizerMeCab at 0x7f6043b77130>


from lime.lime_text import LimeTextExplainer
# Class names in the classifier's encoder order, so that LIME's class indices
# line up with the columns of predict_proba.
labels_indices = range(len(dataset.labelset))
labels = lgbm._le.inverse_transform(labels_indices)
# The explained text is split on single spaces.
explainer1 = LimeTextExplainer(class_names=labels, split_expression=lambda x: x.split(" "))


def make_feature_text1(idx: int) -> str:
    """Return validation record ``idx`` as its tokens joined by single spaces.

    The record is read from the module-level ``X_valid`` and tokenized with the
    module-level ``tokenizer``.
    """
    return " ".join(tokenizer.transform([X_valid[idx]])[0])


from sklearn.pipeline import make_pipeline

# Rebuild the pipeline with a Splitter in place of the tokenizer step
# (pipe[1:] drops the tokenizer), for use with the space-joined token text.
# NOTE(review): Splitter is defined in modules.model.text_classify; presumably
# it splits the text back into tokens — confirm.
pipe1 = make_pipeline(Splitter(), pipe[1:])
pipe1

Pipeline(steps=[('splitter',
                 <modules.model.text_classify.Splitter object at 0x7f5fd51bebe0>),
                ('pipeline',
                 Pipeline(steps=[('vectorizer',
                                  TfidfVectorizer(lowercase=False,
                                                  tokenizer=<function ident_tokener at 0x7f6044121a60>)),
                                 ('to_dence',
                                  <modules.model.text_classify.SparsetoDense object at 0x7f5fdaf46e50>),
                                 ('embedder',
                                  FeatureUnion(transformer_list=[('pca',
                                                                  PCA(n_components=16)),
                                                                 ('identity',
                                                                  <modules.model.text_classify.Transer object at 0x7f5fda94a160>)])),
                                 ('classifier',
                                  LGBMClassifier(importance_type='gain',
                                                 num_class=9,
                                                 objective='softmax'))]))])


from sklearn.linear_model import Ridge

class Explainer:
    """Explain a few validation samples with a LIME explainer and render them.

    Holds the explainer, a function turning a validation index into the text
    to explain, and the pipeline whose ``predict_proba`` is being explained.
    """

    def __init__(self, explainer, feature_maker, pipe):
        self.explainer, self.feature_maker, self.pipe = explainer, feature_maker, pipe

    def explain(self, indices_samples, n_pickup=2):
        """Print and display explanations for the first ``n_pickup`` indices.

        Reads the module-level ``rs``, ``y_valid`` and ``p_valid`` at call time.

        Args:
            indices_samples: Indices into the validation set.
            n_pickup: How many leading indices to explain.
        """
        # One Ridge regressor, seeded from the shared random state, is passed
        # to every explain_instance call.
        surrogate = Ridge(alpha=1, fit_intercept=True, random_state=rs)

        for idx in indices_samples[:n_pickup]:
            print("idx:", idx, f"actual: {y_valid[idx]}", f"prediction: {p_valid[idx]}")

            text = self.feature_maker(idx)
            # Only a short preview of the text is printed.
            print("x:", text[:64].strip(), "...")

            explanation = self.explainer.explain_instance(
                text,
                self.pipe.predict_proba,
                num_features=10,
                top_labels=2,
                model_regressor=surrogate,
            )
            explanation.show_in_notebook(text=True)


# Explainer working on the space-joined token text.
explr = Explainer(explainer1, make_feature_text1, pipe1)


# Shared random state. Explainer.explain reads this global at call time, so it
# must be defined before the first explain() call.
rs = numpy.random.RandomState(1234567)


# Validation indices where the prediction matches the label, in shuffled order.
indices_samples_correct = numpy.arange(len(y_valid))[y_valid == p_valid]
indices_samples_correct = rs.permutation(indices_samples_correct)


## Explanations for correctly classified texts
explr.explain(indices_samples_correct)

idx: 1223 actual: movie-enter prediction: movie-enter
x: アバター 抜く 最強 ヒーロー 勢揃い 日本版 オリジナル ポスター 公開 全世界 公開 さ れる わずか 10日間 興行収入 ...

idx: 183 actual: smax prediction: smax
x: こんな アプリ あっ の スマートフォン マウス なる ある 便利 Advanced Touchpad リモート マウス And ...


# Validation indices where the prediction differs from the label, in shuffled order.
indices_samples_wrong = numpy.arange(len(y_valid))[y_valid != p_valid]
indices_samples_wrong = rs.permutation(indices_samples_wrong)


## Explanations for misclassified texts
explr.explain(indices_samples_wrong)

idx: 2155 actual: livedoor-homme prediction: kaden-channel
x: 体 当てれ 水分 取れる 話題 マイクロフィバータオル 人気 ブランド リリース こすら 体 当てる 水分 取れる 濡れ 髪 体 ...

idx: 614 actual: livedoor-homme prediction: movie-enter
x: 快挙 NHKスペシャル 国際エミー賞 受賞 世界 優れ テレビ番組 贈ら れる 国際エミー賞 時事問題 部門 南米 チリ 鉱山事 ...


# Put a Splitter(sep=" ") step in front of the full pipeline (tokenizer
# included), for use with the space-joined original text.
# NOTE(review): Splitter is defined in modules.model.text_classify; presumably
# it splits the text on the separator — confirm.
pipe2 = make_pipeline(Splitter(sep=" "), pipe)
pipe2

Pipeline(steps=[('splitter',
                 <modules.model.text_classify.Splitter object at 0x7f5fd51bef70>),
                ('pipeline',
                 Pipeline(steps=[('tokenizer',
                                  <modules.model.text_classify.JpTokenizerMeCab object at 0x7f6043b77130>),
                                 ('vectorizer',
                                  TfidfVectorizer(lowercase=False,
                                                  tokenizer=<function ident_tokener at 0x7f6044121a60>)),
                                 ('to_dence',
                                  <modules.model.text_classify.SparsetoDense object at 0x7f5fdaf46e50>),
                                 ('embedder',
                                  FeatureUnion(transformer_list=[('pca',
                                                                  PCA(n_components=16)),
                                                                 ('identity',
                                                                  <modules.model.text_classify.Transer object at 0x7f5fda94a160>)])),
                                 ('classifier',
                                  LGBMClassifier(importance_type='gain',
                                                 num_class=9,
                                                 objective='softmax'))]))])


def _tokenize(x: str) -> list:
    """Tokenize space-joined text with the module-level ``tokenizer``.

    ``x`` is split on single spaces, the resulting pieces are passed to the
    tokenizer as one document, and the result for that document is returned.
    """
    return tokenizer.transform([x.split(" ")])[0]


def make_feature_text2(idx: int) -> str:
    """Return validation record ``idx`` with its lines joined by single spaces.

    The record is read from the module-level ``X_valid``.
    """
    return " ".join(X_valid[idx])


# LIME explainer that splits the text with _tokenize rather than on spaces.
explainer2 = LimeTextExplainer(class_names=labels, split_expression=_tokenize)


# Rebind explr to the variant that works on the original (untokenized) text.
explr = Explainer(explainer2, make_feature_text2, pipe2)


## Explanations for correctly classified texts
explr.explain(indices_samples_correct)

idx: 1223 actual: movie-enter prediction: movie-enter
x: 『アバター』を抜くか？　“最強”ヒーローが勢揃いの日本版オリジナルポスター公開 　全世界で公開されると、わずか10日間で興行収入 ...

idx: 183 actual: smax prediction: smax
x: こんなアプリもあったのか！スマートフォンがマウスになる、あると便利な「Advanced Touchpad (リモートマウス)」【 ...


# Analysis of misclassified documents
explr.explain(indices_samples_wrong)

idx: 2155 actual: livedoor-homme prediction: kaden-channel
x: 体に当てれば水分が取れる！ 話題のマイクロフィバータオルが人気ブランドからリリース！ こすらずに体に当てるだけで水分が取れる—— ...

idx: 614 actual: livedoor-homme prediction: movie-enter
x: 快挙！NHKスペシャルが国際エミー賞を受賞 世界の優れたテレビ番組に贈られる国際エミー賞の時事問題部門に、南米チリの鉱山事故の救 ...


from IPython.display import HTML

# Inject a script that hides the notebook's code input areas
# (div.jp-Cell-inputArea) once the document is ready; code_toggle() flips
# between hidden and shown on each call.
HTML('''<script src="https://code.jquery.com/jquery-3.3.1.slim.min.js">
</script>
<script>
code_show=true; 
function code_toggle() {
 if (code_show){
  $('div.jp-Cell-inputArea').hide();
 } else {
  $('div.jp-Cell-inputArea').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
''')
# Disabled: a form button that would call code_toggle() manually.
# <form action="javascript:code_toggle()"><input type="submit" value="Show/hide code"></form>

文書分類の可視化¶

学習済モデル(パイプライン)¶

学習済モデルに対応するデータセット¶

訓練セットの正答率¶

検証セットの正答率¶

注意¶

lgbm 内部で使っているラベルエンコーダのラベルリスト¶

データセットインスタンスのラベルリスト¶

PR Curve¶

Precision - Recall 軸のグラフ¶

Threshold - Precision, Recall 軸のグラフ¶

LIME¶

トークナイザを確認¶

気軽に文書の単語集合をハイライト¶

ポイント！：Pipeline を作り直す¶

正判定のテキストの説明・ハイライト¶

誤判定のテキストの説明・ハイライト¶

参考：青空文庫の文書カテゴリ¶

オリジナル文書に対してハイライト¶

パイプラインを作り直す¶

正判定のテキストの説明・ハイライト¶

誤判定のテキストの説明・ハイライト¶