import os.path
import math


def clean_up(s):
    """ Return a version of string str in which all letters have been
    converted to lowercase and punctuation characters have been stripped
    from both ends. Inner punctuation is left untouched. """
    punctuation = '''!"',;:.-?)([]*#\n\t\r'''
    result = s.lower().strip(punctuation)
    return result
def average_word_length(line_list):
    ''' Return the average length of all words in line_list.
    Do not include surrounding punctuation in words.
    text is a non-empty list of strings each ending in \n.
    At least one line in text contains a word.'''

    # To do: Replace this function's body to meet its specification.
    return 1.0
def unique_words_ratio(line_list):
    ''' Return the type token ratio (TTR) for this line_list.
    TTR is the number of different words divided by the total number of words.
    text is a non-empty list of strings each ending in \n.
    At least one line in text contains a word. '''
    # To do: Replace this function's body to meet its specification.
    return 1.0
def hapax_legomana_ratio(line_list):
    ''' Return the hapax_legomana ratio for this text.
    This ratio is the number of words that occur exactly once divided
    by the total number of words.
    text is a list of strings each ending in \n.
    At least one line in text contains a word.'''

    # To do: Replace this function's body to meet its specification.
    return 1.0
def split_text(original, separators):
    '''Return a list of non-empty, non-blank strings from the original string
    determined by splitting the string on any of the separators.
    separators is a string of single-character separators.'''
    # To do: Complete this function's body to meet its specification.
    result = []
    return result
def average_sentence_length(text_from_file):
    ''' Return the average number of words per sentence in text.
    text_from_file is guaranteed to have at least one sentence.
    Sentence terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file. '''
    # To do: Replace this function's body to meet its specification.
    return 1.0
def average_sentence_complexity(text_from_file):
    '''Return the average number of phrases per sentence.
    Terminating punctuation defined as !?.
    A sentence is defined as a non-empty string of non-terminating
    punctuation surrounded by terminating punctuation
    or beginning or end of file.
    Phrases are substrings of sentences separated by
    one or more of the following delimiters ,;: '''
    # To do: Replace this function's body to meet its specification.
    return 1.0
def get_valid_filename(prompt):
    '''Use prompt (a string) to ask the user to type the name of a file. If
    the file does not exist, keep asking until they give a valid filename.
    Return the name of that file.'''
    # To do: Complete this function's body to meet its specification.
    filename = input(prompt)
    return filename
    # Uncomment and use this statement as many times as needed for input:
    # filename = input(prompt)
    # Uncomment and use this statement as many times as needed for output:
    # print("That file does not exist.")
    # Do not use any other input or output statements in this function.
def read_directory_name(prompt):
    '''Use prompt (a string) to ask the user to type the name of a directory. If
    the directory does not exist, keep asking until they give a valid directory.
    Return the name of that directory.'''
    # To do: Complete this function's body to meet its specification.
    dirname = input(prompt)
    return dirname
    # Uncomment and use this statement as many times as needed for input:
    # dirname = input(prompt)
    # Uncomment and use this statement as many times as needed for output:
    # print("That directory does not exist.")
    # Do not use any other input or output statements in this function.
def compare_signatures(sig1, sig2, weight):
    '''Return a non-negative real number indicating the similarity of two
    linguistic signatures. The smaller the number the more similar the
    signatures. Zero indicates identical signatures.
    sig1 and sig2 are 6 element lists with the following elements
    0 : author name (a string)
    1 : average word length (float)
    2 : TTR (float)
    3 : Hapax Legomana Ratio (float)
    4 : average sentence length (float)
    5 : average sentence complexity (float)
    weight is a list of multiplicative weights to apply to each
    linguistic feature. weight[0] is ignored.
    '''
    # To do: Replace this function's body to meet its specification.
    return 0.0
def read_signature(filename):
    '''Read a linguistic signature from filename and return it as
    a list of features. '''
    file = open(filename, 'r')
    # the first feature is a string so it doesn't need casting to float
    result = [file.readline()]
    # all remaining features are real numbers
    for line in file:
        result.append(float(line.strip()))
    return result
if __name__ == '__main__':
    prompt = 'enter the name of the file with unknown author: '
    mystery_filename = get_valid_filename(prompt)

    # readlines gives us a list of strings, one for each line of the file
    text = open(mystery_filename, 'r').readlines()

    # calculate the signature for the mystery file
    mystery_signature = [mystery_filename]
    mystery_signature.append(average_word_length(text))
    mystery_signature.append(unique_words_ratio(text))
    mystery_signature.append(hapax_legomana_ratio(text))
    mystery_signature.append(average_sentence_length(text))
    mystery_signature.append(average_sentence_complexity(text))

    weights = [0, 11, 33, 50, 0.4, 4]

    prompt = 'enter the path to the directory of signature files: '
    dir = read_directory_name(prompt)

    # every file in this directory must be a linguistic signature
    files = os.listdir(dir)

    # we will assume that there is at least one signature in that directory
    this_file = files[0]
    signature = read_signature('%s/%s' % (dir, this_file))
    best_score = compare_signatures(mystery_signature, signature, weights)
    best_author = signature[0]
    for this_file in files[1:]:
        signature = read_signature('%s/%s' % (dir, this_file))
        score = compare_signatures(mystery_signature, signature, weights)
        if score < best_score:
            best_score = score
            best_author = signature[0]
    print("best author match: %s with score %s" % (best_author, best_score))

Complete and test (using the module test_detect.py) an authorship detection program following the process outlined below.
The task.
You are given a program, detect.py, which takes a text file as input and calculates scores for certain patterns in the file. These scores can be compared with previously calculated scores for texts written by known authors.
You have a set of files containing mystery texts, and files that contain signatures (author names together with feature scores) for texts by known authors. You will need to write the functions below, each of which calculates a score for a particular linguistic feature of the text.
Linguistic features, with a description of each and the function that computes it:
· Average word length: the average number of characters per word, calculated after the punctuation has been stripped using the clean_up function (rounded to 2 decimal places). Function: average_word_length()
· Type token ratio: the number of different words used in a text divided by the total number of words (measures how repetitive the vocabulary is). Use the provided clean_up function so that "this", "This", "this," and "(this" are not counted as different words. Function: unique_words_ratio()
· Hapax legomena ratio: the number of words occurring exactly once in the text divided by the total number of words. Function: hapax_legomana_ratio()
· Average number of words per sentence: the mean number of words over all sentences in the text. Function: average_sentence_length()
· Average number of phrases per sentence: find the phrases by taking each sentence, as defined in the Definitions below, and splitting it on any of colon, semi-colon or comma. Function: average_sentence_complexity()
Definitions:
· Token = string that you get from calling the string method split on a line of the file.
· Word = non-empty token from the file that isn't completely made up of punctuation. Find the words in a file by using str.split to find the tokens and then remove the punctuation from the words using the clean_up function in detect.py. If after calling clean_up the resulting word is an empty string, then it isn't considered a word.
· Sentence = sequence of characters that: is terminated by (but doesn't include) the characters ! ? . or the end of the file; excludes whitespace on either end; and is not empty. Create a single string that contains the entire file and then call split_text on that string.
· Phrases = non-empty sections of sentences that are separated by colons, commas, or semi-colons (:,;). A short example illustrating the Word definition follows.
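
To make the Word definition concrete, here is a small illustration using the clean_up function from detect.py; the sample line is invented, and the snippet assumes it is run alongside detect.py.

from detect import clean_up

line = 'Why, hello -- good-bye!\n'
tokens = line.split()          # ['Why,', 'hello', '--', 'good-bye!']
# clean_up lowercases and strips outer punctuation; a token that cleans to
# the empty string (like '--') is not a word.
words = [clean_up(t) for t in tokens if clean_up(t) != '']
# words == ['why', 'hello', 'good-bye']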
TASK 1: Since several features require the program to split a string on any of a set of different separators, write a helper function to do this task. To do this you will complete the function split_text as described by the docstring in the code
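
TASK 1 asks for exactly the helper stubbed out as split_text in detect.py. Below is a minimal sketch of one way it could be completed, assuming the behaviour described in its docstring (split on any of the single-character separators and keep only non-empty, non-blank pieces); whether the pieces should also be stripped of surrounding whitespace is a judgement call here, and test_detect.py is the final word on the expected behaviour.

def split_text(original, separators):
    '''Return a list of non-empty, non-blank strings from the original string
    determined by splitting the string on any of the separators.
    separators is a string of single-character separators.'''
    result = []
    piece = ''
    for ch in original:
        if ch in separators:
            # A separator ends the current piece; keep it only if it contains
            # something other than whitespace.
            if piece.strip() != '':
                result.append(piece.strip())
            piece = ''
        else:
            piece += ch
    # Keep whatever follows the last separator as well.
    if piece.strip() != '':
        result.append(piece.strip())
    return result

Used with the terminating punctuation '!?.' this yields sentences as defined above, and applied again with ',;:' it yields phrases; for example, split_text('Hello there! How are you? Fine, thanks.', '!?.') gives ['Hello there', 'How are you', 'Fine, thanks'].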

Solution

Ximi answered on Dec 04 2021
import os.path
import math


def clean_up(s):
    """ Return a version of string str in which all letters have been
    converted to lowercase and punctuation characters have been stripped
    from both ends. Inner punctuation is left untouched. """
    punctuation = '''!"',;:.-?)([]*#\n\t\r'''
    result = s.lower().strip(punctuation)
    return result
def average_word_length(line_list):
    ''' Return the average length of all words in line_list.
    Do not include surrounding punctuation in words.
    text is a non-empty list of strings each ending in \n.
    At least one line in text contains a word.'''

    # Clean each token and keep only non-empty results: a token that is
    # entirely punctuation is not a word.
    words_length = [
        len(clean_up(word))
        for line in line_list for word in line.strip('\n').split()
        if clean_up(word) != ''
    ]
    avg_word_len = sum(words_length) / len(words_length)
    return avg_word_len
def unique_words_ratio(line_list):
    ''' Return the type token ratio (TTR) for this line_list.
    TTR is the number of different words divided by the total number of words.
    text is a non-empty list of strings each ending in \n.
    At least one line in text contains a word. '''
    # Clean each token and keep only non-empty results.
    words_all = [
        clean_up(word)
        for line in line_list for word in line.strip('\n').split()
        if clean_up(word) != ''
    ]
    words_unique = set(words_all)
    uniq_words_ratio = len(words_unique) / len(words_all)
    return uniq_words_ratio
def hapax_legomana_ratio(line_list):
    ''' Return the hapax_legomana ratio for this text.
    This ratio is the number of words that occur exactly once divided
    by the total number of words.
    text is a list of strings each ending in \n.
    At least one line in text contains a word.'''

    # Clean each token and keep only non-empty results.
    words_all = [
        clean_up(word)
        for line in line_list for word in line.strip('\n').split()
        if clean_up(word) != ''
    ]
    # Count occurrences, then keep only the words that appear exactly once.
    counts = {}
    for word in words_all:
        counts[word] = counts.get(word, 0) + 1
    once = [word for word in counts if counts[word] == 1]
    ratio = len(once) / len(words_all)
    return ratio
def split_text(original, separators):
    '''Return a list of non-empty, non-blank strings from the original string
    determined by splitting the string on any of the separators.
    separators is a string of single-character separators.'''
    # To do: Complete this function's body to meet its specification.