diff --git a/.gitignore b/.gitignore index c65a17d..b8da8a2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ __pycache__ .vscode py_readability_metrics.egg-info dist -build \ No newline at end of file +build +venv \ No newline at end of file diff --git a/readability/readability.py b/readability/readability.py index 3a48d42..c8c4d06 100644 --- a/readability/readability.py +++ b/readability/readability.py @@ -1,13 +1,21 @@ -from .text import Analyzer -from .scorers import ARI, ColemanLiau, DaleChall, Flesch, \ - FleschKincaid, GunningFog, LinsearWrite, Smog, Spache import warnings +import nltk + +from .scorers import (ARI, ColemanLiau, DaleChall, Flesch, FleschKincaid, + GunningFog, LinsearWrite, LixLesbarkeitsIndex, + MiyazakiReadabilityIndex, Smog, Spache, + WienerSachtextformel, Gsmog) +from .text import Analyzer + +nltk.download('punkt_tab') + class Readability: - def __init__(self, text, min_words=100): + def __init__(self, text, min_words=100, language='en'): self._analyzer = Analyzer() self._statistics = self._analyzer.analyze(text) self._min_words = min_words + self._language = language if self._min_words < 100: warnings.warn( "Documents with fewer than 100 words may affect the accuracy of readability tests" @@ -27,7 +35,7 @@ def dale_chall(self): def flesch(self): """Calculate Flesch Reading Ease score.""" - return Flesch(self._statistics, self._min_words).score() + return Flesch(self._statistics, self._min_words, self._language).score() def flesch_kincaid(self): """Calculate Flesch-Kincaid Grade Level.""" @@ -46,6 +54,34 @@ def smog(self,all_sentences=False, ignore_length=False): `all_sentences` indicates whether SMOG should use a sample of 30 sentences, as described in the original paper, or if it should use all sentences in the text""" return Smog(self._statistics, self._analyzer.sentences, all_sentences=all_sentences, ignore_length=ignore_length).score() + + def gsmog(self, ignore_length=False): + """GSMOG Index. 
Measure the SMOG score adapted for German text""" + return Gsmog(self._statistics, ignore_length=ignore_length).score() + + def erste_wiener_sachtextformel(self): + """erste Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).erste_wiener_sachtextformel_score() + + def zweite_wiener_sachtextformel(self): + """zweite Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).zweite_wiener_sachtextformel_score() + + def dritte_wiener_sachtextformel(self): + """dritte Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).dritte_wiener_sachtextformel_score() + + def vierte_wiener_sachtextformel(self): + """vierte Wiener Sachtextformel.""" + return WienerSachtextformel(self._statistics, self._min_words).vierte_wiener_sachtextformel_score() + + def lix_lesbarkeits_index(self): + """LIX Lesbarkeitsindex.""" + return LixLesbarkeitsIndex(self._statistics, self._min_words).score() + + def miyazaki_readability_index(self): + """Miyazaki Readability Index.""" + return MiyazakiReadabilityIndex(self._statistics, self._min_words).score() def spache(self): """Spache Index.""" @@ -59,4 +95,6 @@ def statistics(self): 'num_polysyllabic_words': self._statistics.num_poly_syllable_words, 'avg_words_per_sentence': self._statistics.avg_words_per_sentence, 'avg_syllables_per_word': self._statistics.avg_syllables_per_word, + 'num_six_letter_words': self._statistics.num_six_letter_words, + 'num_mono_syllable_words': self._statistics.num_mono_syllable_words, } diff --git a/readability/scorers/__init__.py b/readability/scorers/__init__.py index df708e8..c42e794 100644 --- a/readability/scorers/__init__.py +++ b/readability/scorers/__init__.py @@ -1,10 +1,14 @@ +from .ari import ARI +from .coleman_liau import ColemanLiau +from .dale_chall import DaleChall from .flesch import Flesch from .flesch_kincaid import FleschKincaid +from .gsmog import Gsmog from .gunning_fog import GunningFog 
-from .coleman_liau import ColemanLiau -from .dale_chall import DaleChall -from .ari import ARI from .linsear_write import LinsearWrite +from .lix import LixLesbarkeitsIndex +from .miyazaki_readability_index import MiyazakiReadabilityIndex from .smog import Smog from .spache import Spache +from .wiener_sachtextformel import WienerSachtextformel diff --git a/readability/scorers/flesch.py b/readability/scorers/flesch.py index 7d35cc0..25cbd3f 100644 --- a/readability/scorers/flesch.py +++ b/readability/scorers/flesch.py @@ -13,8 +13,9 @@ def __str__(self): class Flesch: - def __init__(self, stats, min_words=100): + def __init__(self, stats, min_words=100, language='en'): self._stats = stats + self._language = language if stats.num_words < min_words: raise ReadabilityException('{} words required.'.format(min_words)) @@ -27,38 +28,78 @@ def score(self): def _score(self): stats = self._stats - words_per_sent = stats.num_words / stats.num_sentences - syllables_per_word = stats.num_syllables / stats.num_words - return 206.835 - (1.015 * words_per_sent) - (84.6 * syllables_per_word) + if self._language == 'en': + words_per_sent = stats.num_words / stats.num_sentences + syllables_per_word = stats.num_syllables / stats.num_words + return 206.835 - (1.015 * words_per_sent) - (84.6 * syllables_per_word) + elif self._language == 'de': + words_per_sent = stats.num_words / stats.num_sentences + syllables_per_word = stats.num_syllables / stats.num_words + return 180 - words_per_sent - (58.5 * syllables_per_word) + else: + raise ReadabilityException('Unsupported language: {}'.format(self._language)) + def _ease(self, score): - if score >= 90 and score <= 100: - return 'very_easy' - elif score >= 80 and score < 90: - return 'easy' - elif score >= 70 and score < 80: - return 'fairly_easy' - elif score >= 60 and score < 70: - return 'standard' - elif score >= 50 and score < 60: - return 'fairly_difficult' - elif score >= 30 and score < 50: - return 'difficult' - else: - return 
'very_confusing' + if self._language == 'en': + if score >= 90 and score <= 100: + return 'very_easy' + elif score >= 80 and score < 90: + return 'easy' + elif score >= 70 and score < 80: + return 'fairly_easy' + elif score >= 60 and score < 70: + return 'standard' + elif score >= 50 and score < 60: + return 'fairly_difficult' + elif score >= 30 and score < 50: + return 'difficult' + else: + return 'very_confusing' + elif self._language == 'de': + if score >= 90 and score <= 100: + return 'sehr_leicht' + elif score >= 80 and score < 90: + return 'leicht' + elif score >= 70 and score < 80: + return 'mittel_leicht' + elif score >= 60 and score < 70: + return 'mittel' + elif score >= 50 and score < 60: + return 'mittel_schwer' + elif score >= 30 and score < 50: + return 'schwer' + else: + return 'sehr_schwer' def _grade_levels(self, score): - if score >= 90 and score <= 100: - return ['5'] - elif score >= 80 and score < 90: - return ['6'] - elif score >= 70 and score < 80: - return ['7'] - elif score >= 60 and score < 70: - return ['8', '9'] - elif score >= 50 and score < 60: - return ['10', '11', '12'] - elif score >= 30 and score < 50: - return ['college'] - else: - return ['college_graduate'] + if self._language == 'en': + if score >= 90 and score <= 100: + return ['5'] + elif score >= 80 and score < 90: + return ['6'] + elif score >= 70 and score < 80: + return ['7'] + elif score >= 60 and score < 70: + return ['8', '9'] + elif score >= 50 and score < 60: + return ['10', '11', '12'] + elif score >= 30 and score < 50: + return ['college'] + else: + return ['college_graduate'] + elif self._language == 'de': + if score >= 90 and score <= 100: + return ['11'] + elif score >= 80 and score < 90: + return ['11, 12'] + elif score >= 70 and score < 80: + return ['11, 12'] + elif score >= 60 and score < 70: + return ['13, 14, 15'] + elif score >= 50 and score < 60: + return ['13, 14, 15'] + elif score >= 30 and score < 50: + return ['13, 14, 15'] + else: + return 
['Akademikerinnen und Akademiker'] diff --git a/readability/scorers/gsmog.py b/readability/scorers/gsmog.py new file mode 100644 index 0000000..1733aac --- /dev/null +++ b/readability/scorers/gsmog.py @@ -0,0 +1,54 @@ +import math +import warnings + +from readability.exceptions import ReadabilityException + + +class Result: + def __init__(self, score, grade_level): + self.score = score + self.grade_level = grade_level + + def __str__(self): + return "score: {}, grade_level: {}". \ + format(self.score, self.grade_level) + + +class Gsmog: + def __init__(self, stats, ignore_length=False): + """ + Bamberger adapted McLaughlin's original formula (Harry McLaughlin, 1969 https://ogg.osu.edu/media/documents/health_lit/WRRSMOG_Readability_Formula_G._Harry_McLaughlin__1969_.pdf) + for German-speaking countries. The formula compares the number of multisyllabic words (three or more syllables) to the number of sentences in the entire text. Since the original formula refers to a + sample of 30 sentences, this class scales the multisyllabic-word count to 30 sentences over the whole text; texts with fewer than 30 sentences raise unless ignore_length is True (then only a warning is issued). + """ + if stats.num_sentences < 30: + if not ignore_length: + raise ReadabilityException( + 'SMOG requires 30 sentences. {} found' + .format(stats.num_sentences)) + else: + warnings.warn( + 'SMOG requires 30 sentences. 
{} found' + .format(stats.num_sentences)) + + + self._stats = stats + + + def score(self): + score = self._score() + grade_level = self._grade_level(score) + return Result( + score=score, + grade_level=grade_level + ) + + def _score(self): + + num_sentences = self._stats.num_sentences + num_complex_words = self._stats.num_poly_syllable_words # words with 3 or more syllables + return math.sqrt(30 * num_complex_words / num_sentences) - 2 + + def _grade_level(self, score): + return str(round(score)) + diff --git a/readability/scorers/lix.py b/readability/scorers/lix.py new file mode 100644 index 0000000..a547f48 --- /dev/null +++ b/readability/scorers/lix.py @@ -0,0 +1,62 @@ +from readability.exceptions import ReadabilityException + + +class Result: + def __init__(self, score, grade_levels, ease): + self.score = score + self.ease = ease + self.grade_levels = grade_levels + + def __str__(self): + return "score: {}, ease: '{}', grade_levels: {}". \ + format(self.score, self.ease, self.grade_levels) + + + +class LixLesbarkeitsIndex: + def __init__(self, stats, min_words=100): + self._stats = stats + if stats.num_words < min_words: + raise ReadabilityException('{} words required.'.format(min_words)) + + def score(self): + score = self._score() + return Result( + score=score, + ease=self._ease(score), + grade_levels=self._grade_levels(score) + ) + + def _score(self): + """ + Calculates the Lix readability index + :param avg_words_per_sentence: mean sentence length + :param ratio_long_words: ratio of words with six or more characters + :return: Lix index + """ + stats = self._stats + return stats.avg_words_per_sentence + stats.avg_num_six_letter_words + + def _ease(self, score): + if score >= 4 and score <= 5: + return 'very_easy' + elif score >=6 and score <=7: + return 'easy' + elif score >=8 and score <=10: + return 'average' + elif score >=11 and score <=12: + return 'difficult' + else: + return 'very_difficult' + + def _grade_levels(self, score): + if score >= 4 and 
score <= 5: + return [4, 5] + elif score >=6 and score <=7: + return [6, 7] + elif score >=8 and score <=10: + return [8, 9, 10] + elif score >=11 and score <=12: + return [11, 12] + else: + return ['college level and above'] diff --git a/readability/scorers/miyazaki_readability_index.py b/readability/scorers/miyazaki_readability_index.py new file mode 100644 index 0000000..8cf6323 --- /dev/null +++ b/readability/scorers/miyazaki_readability_index.py @@ -0,0 +1,75 @@ +from readability.exceptions import ReadabilityException + + +class Result: + def __init__(self, score, grade_levels, ease): + self.score = score + self.ease = ease + self.grade_levels = grade_levels + + def __str__(self): + return "score: {}, ease: '{}', grade_levels: {}". \ + format(self.score, self.ease, self.grade_levels) + + + +class MiyazakiReadabilityIndex: + def __init__(self, stats, min_words=100): + self._stats = stats + if stats.num_words < min_words: + raise ReadabilityException('{} words required.'.format(min_words)) + + def score(self): + score = self._score() + return Result( + score=score, + ease=self._ease(score), + grade_levels=self._grade_levels(score) + ) + + def _score(self): + """ + Calculates the Miyazaki English as a Foreign Language Readability Index by Greenfield 1999 + It is parametrized for Japanese L2 speakers of English, who are students and read academic texts. 
+ Average score of 50, ranges between 100 and minus infinity + + Formula: 164.935 - 18.792 * word_length - 1.916 * sentence_length + + :param word_length: average word length in characters + :param sentence_length: average sentence length in words + :return: ML2RI + """ + stats = self._stats + return 164.935 - 18.792 * stats.num_letters - 1.916 * stats.num_words + + def _ease(self, score): + if score >= 91 and score <= 100: + return 'very_easy' + elif score >= 81 and score <= 90: + return 'easy' + elif score >= 71 and score <= 80: + return 'Fairly easy' + elif score >= 61 and score <= 70: + return 'standard' + elif score >= 51 and score <= 60: + return 'fairly difficult' + elif score >= 31 and score <= 50: + return 'difficult' + elif score < 31: + return 'very_difficult' + + def _grade_levels(self, score): + if score >= 91 and score <= 100: + return ['5'] + elif score >= 81 and score <= 90: + return ['6'] + elif score >= 71 and score <= 80: + return ['7'] + elif score >= 61 and score <= 70: + return ['8', '9'] + elif score >= 51 and score <= 60: + return ['10', '11', '12'] + elif score >= 31 and score <= 50: + return ['post-school/college level'] + elif score < 31: + return ['university graduate'] \ No newline at end of file diff --git a/readability/scorers/wiener_sachtextformel.py b/readability/scorers/wiener_sachtextformel.py new file mode 100644 index 0000000..6279f27 --- /dev/null +++ b/readability/scorers/wiener_sachtextformel.py @@ -0,0 +1,123 @@ +from readability.exceptions import ReadabilityException + + +class Result: + def __init__(self, score, grade_levels, ease): + self.score = score + self.ease = ease + self.grade_levels = grade_levels + + def __str__(self): + return "score: {}, ease: '{}', grade_levels: {}". 
\ + format(self.score, self.ease, self.grade_levels) + + +class WienerSachtextformel: + def __init__(self, stats, min_words=100): + self._stats = stats + if stats.num_words < min_words: + raise ReadabilityException('{} words required.'.format(min_words)) + + def erste_wiener_sachtextformel_score(self): + score = self._erste_wiener_sachtextformel_score() + return Result( + score=score, + ease=self._ease(score), + grade_levels=self._grade_levels(score) + ) + + def zweite_wiener_sachtextformel_score(self): + score = self._zweite_wiener_sachtextformel_score() + return Result( + score=score, + ease=self._ease(score), + grade_levels=self._grade_levels(score) + ) + + def dritte_wiener_sachtextformel_score(self): + score = self._dritte_wiener_sachtextformel_score() + return Result( + score=score, + ease=self._ease(score), + grade_levels=self._grade_levels(score) + ) + + def vierte_wiener_sachtextformel_score(self): + score = self._vierte_wiener_sachtextformel_score() + return Result( + score=score, + ease=self._ease(score), + grade_levels=self._grade_levels(score) + ) + + def _erste_wiener_sachtextformel_score(self): + """ + The first Wiener Sachtextformel + WSTF1 considers all four factors: proportion of polysyllabic words (three or more syllables), mean sentence length, proportion of long words (six or more letters), and proportion of monosyllabic words. + + The formula is: + 0.1935 * ratio of words with >= 3 syllables + 0.1672 * mean sentence length + + 0.1297 * ratio of words with >= 6 letters - 0.0327 * ratio of words with 1 syllable - 0.875 + """ + stats = self._stats + return (0.1935 * (stats.num_poly_syllable_words / stats.num_words)) + (0.1672 * stats.avg_words_per_sentence) + \ + (0.1297 * (stats.num_six_letter_words / stats.num_words)) - (0.0327 * (stats.num_mono_syllable_words / stats.num_words)*100) - 0.875 + + def _zweite_wiener_sachtextformel_score(self): + """ + The second Wiener Sachtextformel + WSTF2 is similar to WSTF1, but weights the factors slightly differently, omitting the proportion of monosyllabic words. 
+ + The formula is: + 0.2007 * ratio of words with >= 3 syllables + 0.1682 * mean sentence length + + 0.1373 * ratio of words with >= 6 letters - 2.779 + """ + stats = self._stats + return (0.2007 * (stats.num_poly_syllable_words / stats.num_words)) + (0.1682 * stats.avg_words_per_sentence) + \ + (0.1373 * (stats.num_six_letter_words / stats.num_words)) - 2.779 + + def _dritte_wiener_sachtextformel_score(self): + """ + The third Wiener Sachtextformel + WSTF3 is the simplest formula because it only takes into account the mean sentence length and the proportion of long words. + + The formula is: + 0.2963 * ratio of words with >= 3 syllables + 0.1905 * mean sentence length - 1.1144 + """ + stats = self._stats + return (0.2963 * (stats.num_poly_syllable_words / stats.num_words)) + (0.1905 * stats.avg_words_per_sentence) - 1.1144 + + def _vierte_wiener_sachtextformel_score(self): + """ + The fourth Wiener Sachtextformel + WSTF4 focuses specifically on readability in relation to school levels, which is why the weighting of sentence length is greater. 
+ + The formula is: + 0.2744 * ratio of words with >= 3 syllables + 0.2656 * mean sentence length - 1.693 + """ + stats = self._stats + return (0.2744 * (stats.num_poly_syllable_words / stats.num_words)) + (0.2656 * stats.avg_words_per_sentence) - 1.693 + + def _ease(self, score): + if score >= 4 and score <= 5: + return 'very_easy' + elif score >=6 and score <=7: + return 'easy' + elif score >=8 and score <=10: + return 'average' + elif score >=11 and score <=12: + return 'difficult' + else: + return 'very_difficult' + + def _grade_levels(self, score): + if score >= 4 and score <= 5: + return ['4', '5'] + elif score >=6 and score <=7: + return ['6', '7'] + elif score >=8 and score <=10: + return ['8', '9', '10'] + elif score >=11 and score <=12: + return ['11', '12'] + else: + return ['college level and above'] diff --git a/readability/text/analyzer.py b/readability/text/analyzer.py index dce409e..a0c205f 100644 --- a/readability/text/analyzer.py +++ b/readability/text/analyzer.py @@ -1,8 +1,10 @@ import os import re -from .syllables import count as count_syllables -from nltk.tokenize import sent_tokenize, TweetTokenizer + from nltk.stem.porter import PorterStemmer +from nltk.tokenize import TweetTokenizer, sent_tokenize + +from .syllables import count as count_syllables class AnalyzerStatistics: @@ -40,7 +42,19 @@ def num_dale_chall_complex(self): @property def num_spache_complex(self): return self.stats['num_spache_complex'] - + + @property + def num_mono_syllable_words(self): + return self.stats['num_mono_syllable_words'] + + @property + def num_six_letter_words(self): + return self.stats['num_six_letter_words'] + + @property + def avg_num_six_letter_words(self): + return self.stats['num_six_letter_words'] / self.stats['num_words'] if self.stats['num_words'] > 0 else 0 + @property def avg_words_per_sentence(self): return self.num_words / self.num_sentences @@ -75,6 +89,8 @@ def _statistics(self, text): gunning_complex_count = 0 dale_chall_complex_count = 0 
spache_complex_count = 0 + mono_syllable_count = 0 + six_letter_word_count = 0 porter_stemmer = PorterStemmer() def is_gunning_complex(t, syllable_count): @@ -97,7 +113,9 @@ def is_spache_complex(t): word_syllable_count = count_syllables(t) syllable_count += word_syllable_count letters_count += len(t) + six_letter_word_count += 1 if len(t) >= 6 else 0 poly_syllable_count += 1 if word_syllable_count >= 3 else 0 + mono_syllable_count += 1 if word_syllable_count == 1 else 0 gunning_complex_count += \ 1 if is_gunning_complex(t, word_syllable_count) \ else 0 @@ -119,6 +137,9 @@ def is_spache_complex(t): 'num_dale_chall_complex': dale_chall_complex_count, 'num_spache_complex': spache_complex_count, 'sentences': sentences, + 'num_mono_syllable_words': mono_syllable_count, + 'num_six_letter_words': six_letter_word_count, + 'avg_words_per_sentence': word_count / sentence_count if sentence_count > 0 else 0, } def _tokenize_sentences(self, text): @@ -153,3 +174,7 @@ def _load_spache(self): spache_path = os.path.join(cur_path, '..', 'data', file) with open(spache_path) as f: return set(line.strip() for line in f) + with open(spache_path) as f: + return set(line.strip() for line in f) + with open(spache_path) as f: + return set(line.strip() for line in f) diff --git a/test/test_readability.py b/test/test_readability.py index 46e0d1b..0430f36 100644 --- a/test/test_readability.py +++ b/test/test_readability.py @@ -1,4 +1,5 @@ import unittest + from readability import Readability @@ -90,3 +91,64 @@ def test_print_stats(self): self.assertEqual(117, stats['num_words']) self.assertEqual(7, stats['num_sentences']) self.assertEqual(20, stats['num_polysyllabic_words']) + +class ReadabilityTestGerman(unittest.TestCase): + def setUp(self): + german_text = """ + In der Linguistik ist der Gunning-Fog-Index ein Lesbarkeitsindex für englische Texte. Der Index schätzt die Jahre formaler Bildung, die eine Person benötigt, um den Text beim ersten Lesen zu verstehen. 
Ein Fog-Index von 12 erfordert beispielsweise das Leseverständnis eines Schülers der letzten Klasse einer amerikanischen High School (etwa 18 Jahre alt). Der Test wurde 1952 von Robert Gunning, einem amerikanischen Geschäftsmann, der in Zeitungs- und Lehrbuchverlagen tätig war, entwickelt. + Der Fog-Index wird häufig verwendet, um zu bestätigen, dass ein Text für die beabsichtigte Zielgruppe leicht lesbar ist. Texte für ein breites Publikum sollten in der Regel einen Fog-Index von weniger als 12 haben. Texte, die ein nahezu universelles Verständnis erfordern, sollten einen Index von weniger als 8 haben. + """ + self.readability = Readability(german_text, language='de') + + def test_flesch_german(self): + r = self.readability.flesch() + print(r) + self.assertGreaterEqual(r.score, 60 ) + self.assertEqual(['13-15 jährige Schülerinnen und Schüler'], r.grade_levels) + self.assertEqual('mittel', r.ease) + + def test_erste_wiener_sachtextformel_german(self): + r = self.readability.erste_wiener_sachtextformel() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_zweite_wiener_sachtextformel_german(self): + r = self.readability.zweite_wiener_sachtextformel() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_dritte_wiener_sachtextformel_german(self): + r = self.readability.dritte_wiener_sachtextformel() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_vierte_wiener_sachtextformel_german(self): + r = self.readability.vierte_wiener_sachtextformel() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_lix_lesbarkeits_index_german(self): + r = 
self.readability.lix_lesbarkeits_index() + print(r) + self.assertGreaterEqual( r.score, 11.0) + self.assertLessEqual(r.score, 12.0) + self.assertEqual(r.grade_level, ['11th-12th grade']) + + def test_miyazaki_readability_index_german(self): + r = self.readability.miyazaki_readability_index() + print(r) + self.assertGreaterEqual( r.score, 31.0) + self.assertLessEqual(r.score, 50.0) + self.assertEqual(r.grade_level, ['post-school/college level']) + + def test_gsmog_german(self): + r = self.readability.gsmog() + print(r)