
Tutorials

All example scripts presented on this page can be found in the repository in the examples directory.

Setup, Retrieval, Serialization
import logging

import germaparlpy.utilities as utilities
from germaparlpy.corpus import *

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

if __name__ == "__main__":

    # Clone the GermaParl corpus from GitHub. We specify the parent directory as the target since we are one level below.
    utilities.clone_corpus(directory="..")

    # We deserialize the XML corpus and specify the legislative periods as an interval. The interval range(16, 20) in
    # Python comprises 16, 17, 18 and 19 because ranges in Python are right-open. An integer representing a single
    # legislative term is also a valid argument.
    corpus = Corpus.deserialize_from_xml(lp=range(16, 20), path="../GermaParlTEI")

    # Retrieval of all speeches from members affiliated with the party SPD:
    partition_spd = corpus.get_speeches_from_party(party="SPD")

    # All speeches are enclosed within an sp element that is annotated with metadata in element attributes. Print all
    # attribute names to determine what to search for.
    # Output: ['who_original', 'party', 'parliamentary_group', 'who', 'name', 'position', 'role']
    unique_element_attributes = utilities.extract_element_attributes(corpus, tag="sp")
    print(unique_element_attributes)

    # After retrieving the attribute names, we can search the corpus for the unique values of a given attribute. Let's
    # assume you want a list of all roles annotated in the parliament.
    # Output: ['mp', 'presidency', 'parliamentary_commissioner', 'misc', 'government']
    unique_role_values = utilities.extract_attribute_values(corpus, tag="sp", attribute="role")
    print(unique_role_values)

    # Let's retrieve all speeches from all members of the cabinet in the corpus.
    partition_government = corpus.get_speeches_from_role(role="government")

    # Retrieval methods can be chained. Let's assume that you want to retrieve all speeches by CDU members who are
    # regular members of parliament and whose speeches contain the term "Wirtschaft" at least once. You can use the
    # following method chain:
    partition = (corpus.get_speeches_from_party(party="CDU")
               .get_speeches_from_role(role="mp")
               .get_speeches_from_keyword(keyword="Wirtschaft"))
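
    # For readers who prefer explicit steps, the chain above should be equivalent to applying each filter in
    # sequence, assuming every retrieval method returns a new partition object (which the chaining implies):
    # step = corpus.get_speeches_from_party(party="CDU")
    # step = step.get_speeches_from_role(role="mp")
    # partition = step.get_speeches_from_keyword(keyword="Wirtschaft")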

    # You can get the actual content from the markup as a list of strings for further processing with toolkit methods:
    all_paragraphs = utilities.get_paragraphs_from_corpus(partition)
    all_interjections = utilities.get_interjections_from_corpus(partition)

    # You can use the built-in functions len() and bool() on corpus or partition objects.
    # Output: Our partition comprises 908 documents and 119462 paragraphs.
    print(f"Our partition comprises {len(partition)} documents and {len(all_paragraphs)} paragraphs.")

    # Partition objects can be serialized as XML for human inspection, and Corpus and Partition instances can be
    # serialized as JSON for intermediate storage.
    partition.serialize_corpus_as_xml(path="../derived_corpus")

    # JSON serialization of a corpus object
    corpus.serialize(path="backup.json")

    # Deserialize a corpus:
    new_corpus = Corpus.deserialize_from_json(path="backup.json")
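
    # A minimal sanity check of the JSON round trip, relying only on the len() support described above and
    # assuming deserialization preserves the document count:
    assert len(new_corpus) == len(corpus), "round-tripped corpus should contain the same number of documents"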

TTR Calculation
import logging

import matplotlib.pyplot as plt
import germaparlpy.utilities as utilities
from germaparlpy.corpus import *

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

if __name__ == "__main__":

    # Clone the GermaParl corpus from GitHub. We specify the parent directory as the target since we are one level below.
    utilities.clone_corpus(directory="..")

    # We deserialize the XML corpus without specifying a legislative period. The default value is range(1, 20), which
    # includes the entire corpus.
    corpus = Corpus.deserialize_from_xml(path="../GermaParlTEI")

    # Let's assume we want to calculate the type-token ratio of all German chancellors. The type-token ratio (TTR)
    # is a simple statistical coefficient that quantifies vocabulary diversity: the number of unique tokens (types)
    # divided by the total number of tokens.

    # We implement the TTR calculation as a function.
    def calculate_ttr(text: list[str]) -> float:
        text = [speech.split(" ") for speech in text]  # tokenize each speech on spaces
        text = [token for speech in text for token in speech if token.isalnum()]  # keep only alphanumeric tokens
        return len(set(text)) / len(text)  # number of types divided by number of tokens
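
    # A tiny worked example of the ratio (our own toy input, not corpus data): 5 tokens, 4 unique types,
    # so the TTR is 4 / 5 = 0.8. Note that the function is case-sensitive.
    assert calculate_ttr(["das Meer und das Land"]) == 0.8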

    # We define all chancellors
    chancellor_list = [
        "Konrad Adenauer",
        "Ludwig Erhard",
        "Kurt Georg Kiesinger",
        "Willy Brandt",
        "Helmut Schmidt",
        "Helmut Kohl",
        "Gerhard Schröder",
        "Angela Merkel"
    ]

    # We calculate the TTR for all chancellors and collect the results in a dictionary.
    chancellor_ttr = {}

    for chancellor in chancellor_list:
        chancellor_partition = corpus.get_speeches_from_politician(person=chancellor)
        chancellor_speeches = utilities.get_paragraphs_from_corpus(chancellor_partition)
        chancellor_ttr[chancellor] = calculate_ttr(chancellor_speeches)

    # Output:
    # {'Konrad Adenauer': 0.06581790181141273, 'Ludwig Erhard': 0.07735575796964228,
    # 'Kurt Georg Kiesinger': 0.07238602465784993, 'Willy Brandt': 0.05436169529177415,
    # 'Helmut Schmidt': 0.04721755524197501, 'Gerhard Schröder': 0.0548574862993217,
    # 'Angela Merkel': 0.04755084983588955}
    print(chancellor_ttr)

    # Sort and visualize the results.
    sorted_data = dict(sorted(chancellor_ttr.items(), key=lambda item: item[1], reverse=True))
    plt.figure(figsize=(10, 6))
    plt.bar(sorted_data.keys(), sorted_data.values(), color='skyblue')
    plt.xlabel('Chancellor')
    plt.ylabel('TTR')
    plt.title('TTR of all German chancellors')
    plt.xticks(rotation=45)
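    # Optionally persist the chart before showing it (illustrative filename, standard matplotlib call):
    # plt.savefig("ttr_chancellors.png", dpi=150, bbox_inches="tight")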
    plt.show()

Sentiment Analysis of selected speeches
import logging

import germaparlpy.utilities as utilities
from germaparlpy.corpus import *
from germansentiment import SentimentModel

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

if __name__ == "__main__":

    # We deserialize the corpus and specify the 19th legislative term.
    corpus = Corpus.deserialize_from_xml(lp=19, path="../GermaParlTEI")

    # We load a BERT model trained for German sentiment classification, "german-sentiment-bert" by Guhr et al. (2020).
    sentiment_model = SentimentModel()
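
    # A quick illustration of the model interface (our own sample sentence, not corpus data): predict_sentiment
    # takes a list of strings and returns a list of labels, i.e. "positive", "negative" or "neutral".
    # This should print ['positive'].
    print(sentiment_model.predict_sentiment(["Das ist ein sehr gutes Ergebnis."]))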

    # We retrieve all speeches from the party CDU that contain the keyword "Asyl" or the keyword "Migration".
    corpus_cdu = corpus.get_speeches_from_party("CDU").get_speeches_from_word_list(["Asyl", "Migration"])

    # We retrieve all speeches from the party AfD that contain the keyword "Asyl" or the keyword "Migration".
    corpus_afd = corpus.get_speeches_from_party("AfD").get_speeches_from_word_list(["Asyl", "Migration"])

    # We extract the text from the markup for further processing
    corpus_cdu_paragraphs = utilities.get_paragraphs_from_corpus(corpus_cdu)
    corpus_afd_paragraphs = utilities.get_paragraphs_from_corpus(corpus_afd)

    # Output:
    # The corpus partition containing the speeches of the CDU comprises 7560 paragraphs.
    # The corpus partition containing the speeches of the AfD comprises 8218 paragraphs.
    print(f"The corpus partition containing the speeches of the CDU comprises {len(corpus_cdu_paragraphs)} paragraphs.")
    print(f"The corpus partition containing the speeches of the AfD comprises {len(corpus_afd_paragraphs)} paragraphs.")

    # We implement the polarity calculation as a function since the library returns strings that we need to transform.
    # We also break our data sets into chunks to avoid memory overflow, since our partitions are quite large.

    def calculate_polarity(p: list[str]) -> float:
        chunk_size = 100
        chunks = [p[i:i + chunk_size] for i in range(0, len(p), chunk_size)]
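        # e.g. 250 paragraphs with chunk_size = 100 yield three chunks of 100, 100 and 50 paragraphs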
        total_polarity = 0

        for chunk in chunks:
            chunk_result = sentiment_model.predict_sentiment(chunk)  # reuse the model loaded above

            for s in chunk_result:
                if s == 'neutral':
                    continue
                elif s == 'negative':
                    total_polarity -= 1
                else:
                    total_polarity += 1

        return total_polarity / len(p)
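
    # A small worked example of the score (our own numbers): if 10 paragraphs were classified as 2 positive,
    # 5 negative and 3 neutral, the polarity would be (2 - 5) / 10 = -0.3. Values range from -1 to 1.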

    # We calculate the polarity for both data sets.
    corpus_cdu_polarity = calculate_polarity(corpus_cdu_paragraphs)
    corpus_afd_polarity = calculate_polarity(corpus_afd_paragraphs)

    # Output:
    # The mean polarity of speeches from the CDU in the 19th legislative period of the German parliament
    # containing the word 'Asyl' or 'Migration' is -0.022389875882209784.
    # The mean polarity of speeches from the AfD in the 19th legislative period of the German parliament
    # containing the word 'Asyl' or 'Migration' is -0.11084656084656085.

    print("The mean polarity of speeches from the CDU in the 19th legislative period of the German parliament "
          f"containing the word 'Asyl' or 'Migration' is {corpus_cdu_polarity}.")

    print("The mean polarity of speeches from the AfD in the 19th legislative period of the German parliament "
          f"containing the word 'Asyl' or 'Migration' is {corpus_afd_polarity}.")

Retrieving speeches with regular expressions
import germaparlpy.utilities as utilities
from germaparlpy.corpus import *
import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") # configure output

if __name__ == "__main__":

    # Clone the GermaParl corpus from GitHub. We specify the parent directory as the target since we are one level below.
    utilities.clone_corpus(directory="..")

    # We deserialize the XML corpus and specify the legislative period as an interval. The interval range(19, 20) in
    # Python comprises only 19 because ranges in Python are right-open.
    corpus = Corpus.deserialize_from_xml(lp=range(19, 20), path="../GermaParlTEI")

    # Let's search for speeches in our corpus that match a pattern. Assume we're interested in data politics.
    # For that purpose, we define a regular expression that matches all speeches containing "Daten".
    data_regex = r".*[Dd]aten.*"

    partition_daten = corpus.get_speeches_from_regex(pattern=data_regex)

    # Let's assume that we are interested in interjections that people utter in the context of data.
    partition_daten_interjections = utilities.get_interjections_from_corpus(partition_daten)

    # Output: There are 26167 interjections to speeches containing the term 'Daten'.
    # That's quite a lot, since we have also matched unwanted terms like "Soldaten".
    # Regular expressions are computationally intensive and prone to producing false positives if you are not careful.
    print(f"There are {len(partition_daten_interjections)} interjections to speeches containing the term 'Daten'.")

    # Serialize for human inspection.
    partition_daten.serialize_corpus_as_xml(path="../corpus_daten")