# scripts/stats.py

# ========================================================================
# Copyright 2018 Emory University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
import glob
import json

import os
from collections import Counter, OrderedDict

from tabulate import tabulate

__author__ = 'Jinho D. Choi'

# Keys of the season JSON documents processed by the functions in this file.
# Season-level keys.
SEASON_ID = 'season_id'
EPISODES = 'episodes'
# Episode-level keys.
EPISODE_ID = 'episode_id'
EPISODE = 'episode'
SCENES = 'scenes'
# Scene-level keys.
SCENE_ID = 'scene_id'
UTTERANCES = 'utterances'
# Utterance-level keys; the last three are optional (see ordered_print).
UTTERANCE_ID = 'utterance_id'
SPEAKERS = 'speakers'
TRANSCRIPT = 'transcript'
TRANSCRIPT_WITH_NOTE = 'transcript_with_note'
TOKENS = 'tokens'
TOKENS_WITH_NOTE = 'tokens_with_note'
CHARACTER_ENTITIES = 'character_entities'
EMOTION = 'emotion'
CAPTION = 'caption'


def ordered_print(json_file, s=None):
    """Rewrite a season JSON with its keys in a fixed order and save it as ``<json_file>.v2``.

    Every level (season, episode, scene, utterance) is rebuilt as an
    ``OrderedDict`` holding only the known keys, in a fixed order. String
    values have runs of whitespace collapsed to single spaces. If an object
    carries more keys than the ones copied, ``Error: <level>`` is printed
    (0 = season, 1 = episode, 2 = scene, 3 = utterance); a missing required
    key raises ``KeyError``.

    :param json_file: path of the season JSON; also the stem of the output path.
    :param s: an already-loaded season dict. When ``None`` the season is read
        from ``json_file``. NOTE: the nested lists of a supplied dict are
        modified in place (their entries are replaced by ordered copies).
    """
    def pair(key, d):
        # Normalise whitespace in string values; other value types pass through.
        value = d[key]
        if isinstance(value, str):
            value = ' '.join(value.split())
        return key, value

    if s is None:
        # Context manager so the input handle is closed deterministically.
        with open(json_file, encoding='utf-8') as fin:
            s = json.load(fin)

    season = OrderedDict([pair(SEASON_ID, s), pair(EPISODES, s)])
    if len(s) != len(season):
        print('Error: 0')
    episodes = season[EPISODES]

    for i, e in enumerate(episodes):
        episode = OrderedDict([pair(EPISODE_ID, e), pair(SCENES, e)])
        if len(e) != len(episode):
            print('Error: 1')
        episodes[i] = episode
        scenes = episode[SCENES]

        for j, c in enumerate(scenes):
            scene = OrderedDict([pair(SCENE_ID, c), pair(UTTERANCES, c)])
            if len(c) != len(scene):
                print('Error: 2')
            scenes[j] = scene
            utterances = scene[UTTERANCES]

            for k, u in enumerate(utterances):
                # Required utterance fields, in output order.
                utterance = [
                    pair(UTTERANCE_ID, u),
                    pair(SPEAKERS, u),
                    pair(TRANSCRIPT, u),
                    pair(TRANSCRIPT_WITH_NOTE, u),
                    pair(TOKENS, u),
                    pair(TOKENS_WITH_NOTE, u)]

                # Optional annotation fields are appended only when present.
                for optional_key in (CHARACTER_ENTITIES, EMOTION, CAPTION):
                    if optional_key in u:
                        utterance.append(pair(optional_key, u))

                if len(u) != len(utterance):
                    print('Error: 3')
                utterances[k] = OrderedDict(utterance)

    with open(json_file + '.v2', 'w') as fout:
        json.dump(season, fout, indent=4)


def general_stats(json_file):
    """Compute corpus-size statistics for one season JSON file.

    ``TOKENS`` of an utterance is treated as a list of sentences, each a list
    of tokens. The "with note" (``_wn``) counters use ``TOKENS_WITH_NOTE`` and
    fall back to ``TOKENS`` when the former is empty.

    :param json_file: path of the season JSON file.
    :return: a list of ten items, in this order: season id, number of
        episodes, number of scenes, number of utterances with non-empty
        ``TOKENS``, number of sentences, number of tokens, the set of all
        speakers, total number of utterances, number of sentences with notes,
        number of tokens with notes.
    """
    num_scenes = 0
    num_utterances = 0
    num_utterances_wn = 0
    num_sentences = 0
    num_sentences_wn = 0
    num_tokens = 0
    num_tokens_wn = 0
    all_speakers = set()

    # Context manager so the input handle is closed deterministically.
    with open(json_file, encoding='utf-8') as fin:
        season = json.load(fin)
    episodes = season[EPISODES]

    for episode in episodes:
        scenes = episode[SCENES]
        num_scenes += len(scenes)

        for scene in scenes:
            utterances = scene[UTTERANCES]
            # Every utterance counts here, including those with empty TOKENS.
            num_utterances_wn += len(utterances)

            for utterance in utterances:
                all_speakers.update(utterance[SPEAKERS])

                tokens = utterance[TOKENS]
                if tokens:
                    num_utterances += 1
                    num_sentences += len(tokens)
                    num_tokens += sum(len(t) for t in tokens)

                tokens_wn = utterance[TOKENS_WITH_NOTE] or tokens
                num_sentences_wn += len(tokens_wn)
                num_tokens_wn += sum(len(t) for t in tokens_wn)

    return [season[SEASON_ID], len(episodes), num_scenes, num_utterances, num_sentences, num_tokens,
            all_speakers, num_utterances_wn, num_sentences_wn, num_tokens_wn]


def print_general_stats(json_dir):
    """Print a tab-separated table of general statistics, one row per season file.

    Each ``*.json`` file in ``json_dir`` (in sorted order) is passed to
    ``general_stats``; the speaker set in its result is replaced by its size
    for printing. A final line reports the number of distinct speakers across
    all files.

    :param json_dir: directory containing the season JSON files.
    """
    all_speakers = set()
    # One label per value returned by general_stats, in the same order; the
    # last three label the "with note" counts, which previously had no header.
    header = ['Season ID', 'Episodes', 'Scenes', 'Utterances', 'Sentences', 'Tokens', 'Speakers',
              'Utterances (with note)', 'Sentences (with note)', 'Tokens (with note)']
    print('\t'.join(header))
    for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
        row = general_stats(json_file)
        all_speakers.update(row[6])
        row[6] = len(row[6])
        print('\t'.join(map(str, row)))
    print('All speakers: %s' % (len(all_speakers)))


def _season_entity_stats(season):
    """Tally character-entity annotation counts for one loaded season dict.

    Only utterances that have a ``CHARACTER_ENTITIES`` field and non-empty
    ``TOKENS`` are counted. Each mention is a list whose items from index 2
    onwards are entity labels; mentions containing ``'Non-Entity'`` are
    skipped. A mention with exactly one label (length 3) is singular,
    otherwise plural.

    :param season: season dict as loaded from JSON.
    :return: ``(counts, speakers, entities)`` where ``counts`` is the list
        ``[scenes, utterances, tokens, singular, plural, mentions, clusters]``
        and ``speakers`` / ``entities`` are sets of the distinct names seen.
    """
    speakers = set()
    entities = set()
    num_scenes = 0
    num_utterances = 0
    num_tokens = 0
    num_singular_mentions = 0
    num_plural_mentions = 0
    num_mentions = 0
    num_clusters = 0

    for episode in season[EPISODES]:
        for scene in episode[SCENES]:
            annotated = False
            cluster_set = set()  # distinct entities referred to within this scene

            for utterance in scene[UTTERANCES]:
                if CHARACTER_ENTITIES not in utterance or not utterance[TOKENS]:
                    continue

                annotated = True
                num_utterances += 1
                # TOKENS is a list of sentences (see general_stats), so tokens
                # are summed per sentence rather than counting sentences.
                num_tokens += sum(len(sentence) for sentence in utterance[TOKENS])
                speakers.update(utterance[SPEAKERS])

                for sentence_mentions in utterance[CHARACTER_ENTITIES]:
                    for mention in sentence_mentions:
                        if 'Non-Entity' in mention:
                            continue

                        labels = mention[2:]
                        entities.update(labels)
                        cluster_set.update(labels)

                        if len(mention) == 3:
                            num_singular_mentions += 1
                        else:
                            num_plural_mentions += 1
                        num_mentions += 1

            if annotated:
                num_scenes += 1
            num_clusters += len(cluster_set)

    counts = [num_scenes, num_utterances, num_tokens,
              num_singular_mentions, num_plural_mentions, num_mentions, num_clusters]
    return counts, speakers, entities


def entity_stats(json_dir, max_seasons=4):
    """Print a table of character-entity annotation statistics per season.

    One row is printed per ``*.json`` file in ``json_dir`` (sorted order),
    followed by the number of distinct speakers and entities over all
    processed files.

    :param json_dir: directory containing the season JSON files.
    :param max_seasons: process only the first ``max_seasons`` files; ``None``
        processes all of them. The default of 4 keeps the previously
        hard-coded limit (presumably only those seasons carry entity
        annotations -- TODO confirm).
    """
    # Labels are listed in the same order as the values appended to each row.
    table_header = ['Season ID', 'Episodes', 'Scenes', 'Utterances', 'Tokens', 'Speakers',
                    'Singular', 'Plural', 'Mentions', 'Clusters', 'Entities']

    g_speakers = set()
    g_entities = set()
    table_data = []

    json_files = sorted(glob.glob(os.path.join(json_dir, '*.json')))
    for json_file in json_files[:max_seasons]:
        # Context manager so the input handle is closed deterministically.
        with open(json_file, encoding='utf-8') as fin:
            season = json.load(fin)

        counts, speakers, entities = _season_entity_stats(season)
        num_scenes, num_utterances, num_tokens, num_singular, num_plural, num_mentions, num_clusters = counts

        g_speakers.update(speakers)
        g_entities.update(entities)
        table_data.append([season[SEASON_ID], len(season[EPISODES]), num_scenes, num_utterances, num_tokens,
                           len(speakers), num_singular, num_plural, num_mentions, num_clusters, len(entities)])

    print(tabulate(table_data, headers=table_header, tablefmt='plain'))
    print('\nAll speakers: %s' % (len(g_speakers)))
    print('All entities: %s' % (len(g_entities)))


if __name__ == '__main__':
    # Directory holding the season JSON files. The path is relative, so it is
    # resolved against the current working directory.
    json_dir = '../json'
    # Only the entity statistics run by default; the general statistics call
    # is kept here, disabled.
    # print_general_stats(json_dir)
    entity_stats(json_dir)
    # Disabled: rewrite every season file with ordered keys (writes '<file>.v2').
    # for json_file in sorted(glob.glob(os.path.join(json_dir, '*.json'))):
    #     print(json_file)
    #     ordered_print(json_file)