html5lib on Azure ML Studio not Running in Python

Patrick Hill 1 Reputation point
2020-11-23T22:36:36.307+00:00

I am trying to run a python script inside Azure ML Studio.
The script loads html5lib from an external zipped pip package.

I can get it to run, but it fails when I try to import this name from one of the package's submodules:
from html5lib.serializer import SerializeError

I have tried the following:

  1. I have confirmed the python script exists in the html5lib zip file
  2. I have confirmed the root package loads using 'import html5lib'

Does anyone know what I can try?

Thanks.

Full script below:


Error is:

Error 0085: The following error occurred during script evaluation, please view the output log for more information:
---------- Start of error message from Python interpreter ----------
Caught exception while executing function: Traceback (most recent call last):
File "C:\server\invokepy.py", line 199, in batch
odfs = mod.azureml_main(*idfs)
File "C:\temp\14a349841e5843a59b28ef365e67397a.py", line 195, in azureml_main
dataframe1['sentences']= HTMLSentenceTokenizer().feed(dataframe1['FullEmailContent'])
File "C:\temp\14a349841e5843a59b28ef365e67397a.py", line 69, in __init__
self.parser = html5lib.HTMLParser()
AttributeError: module 'html5lib' has no attribute 'HTMLParser'
Process returned with non-zero exit code 1

---------- End of error message from Python interpreter ----------
Start time: UTC 11/23/2020 22:35:12
End time: UTC 11/23/2020 22:35:24


# The script MUST contain a function named azureml_main
# which is the entry point for this module.

import pandas as pd
import nltk
import html5lib
from html5lib.serializer import SerializeError

# setup

# Tags treated as inline: they never start or end a sentence on their own.
INLINE_ELEMENTS = {
    'a', 'abbr', 'acronym', 'b', 'bdi', 'bdo', 'big', 'cite', 'code', 'dfn', 'em', 'i', 'kbd',
    'label', 'mark', 'nav', 'output', 'progress', 'q', 's', 'slot', 'small', 'span', 'strong',
    'sub', 'sup', 'time', 'tt', 'var', 'wbr',
}

# does not include pre or textarea (which are accounted for in PRESERVE_WHITESPACE_ELEMENTS)
# NOTE(review): PRESERVE_WHITESPACE_ELEMENTS is not defined in this script - confirm.
BLOCK_LEVEL_ELEMENTS = {
    'address', 'article', 'blockquote', 'caption', 'details', 'dialog', 'div', 'dl',
    'dt', 'figcaption', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup',
    'li', 'main', 'ol', 'p', 'ul', 'section', 'table', 'tbody', 'td', 'th', 'thead', 'tr',
}

# Heading-like tags; every member is also listed in BLOCK_LEVEL_ELEMENTS.
HEADER_ELEMENTS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup'}

SKIPPED_ELEMENTS = ["br", "hr"]

# elements which cannot contain children and denote the end of a sentence
EMPTY_ELEMENTS = {'area', 'base', 'br', 'embed', 'hr', 'img'}

# elements which can have children which never qualify as a sentence
SENTENCE_VOID_ELEMENTS = {
    'button', 'caption', 'col', 'colgroup', 'pre', 'table', 'textarea', 'td', 'tfoot',
    'th', 'thead', 'tr',
}

class InvalidTagError(Exception):
    """Error for an empty tag whose type is not one of the accepted element types."""

    def __init__(self, tag_name):
        """
        :param tag_name: Name of the offending tag; it is interpolated into the error message.
        """
        # The dunder underscores had been stripped (``init``), so this constructor was
        # never invoked and the message below was never applied.
        super().__init__("Parsing an empty tag which is not of the accepted element types. It is "
                         "of type {}".format(tag_name))

class HTMLSentenceTokenizer:
    """Extract sentences from the text found inside the <body> of an HTML document.

    The document is parsed with html5lib and walked token by token. Text is accumulated
    between block-level boundaries and split into sentences with nltk whenever a boundary
    is reached.
    """

    def __init__(self, ignore_headers=True, raise_invalid_tags=False):
        """
        :param ignore_headers: If true, ignores text inside of the tags included in HEADER_ELEMENTS. This defaults to
        true because the text inside of these "header elements" is typically not a sentence.
        :param raise_invalid_tags: If true, raises a ValueError when a start tag is not in INLINE_ELEMENTS,
        BLOCK_LEVEL_ELEMENTS (which includes the elements of HEADER_ELEMENTS) or SENTENCE_VOID_ELEMENTS. If false,
        ignores this tag and all of its children. (Sentences descending from it will not be included in the value
        returned from feed)
        """
        # self.parser is an etree parser by default.
        self.parser = html5lib.HTMLParser()
        self.walker = html5lib.getTreeWalker("etree")
        self.sentences = []
        self.ignore_header_text = ignore_headers
        self.raise_invalid_tags = raise_invalid_tags
        self.reset()

    def feed(self, markup):
        """
        Parse one HTML document and return the sentences found inside its <body>.

        Tokens before the <body> start tag are skipped and walking stops at the <body> end tag.
        The tokenizer state is cleared with reset() before returning, so the instance can be
        reused for another document.

        :param markup: The HTML document, passed straight to the html5lib parser.
        :return: A list of sentence strings, in document order.
        :raises SerializeError: If the tree walker emits a 'SerializeError' token.
        :raises ValueError: Propagated from handle_starttag / handle_empty_tag for unaccepted tags.
        """
        etree_document = self.parser.parse(markup)
        stream = self.walker(etree_document)

        # todo: find a more efficient way to only iterate over tags that are a descendant of body
        passed_body = False

        for token in stream:
            token_type = token['type']

            if not passed_body:
                if token_type == 'StartTag' and token['name'] == 'body':
                    passed_body = True
                continue

            if token_type == 'StartTag':
                self.handle_starttag(token['name'])
            elif token_type == 'EndTag':
                if token['name'] == 'body':
                    break
                self.handle_endtag(token['name'])
            elif token_type == 'EmptyTag':
                self.handle_empty_tag(token['name'])
            elif token_type == 'Characters' or (token_type == 'SpaceCharacters' and self.ignored_parent_count > 0):
                self.handle_text(token['data'])
            elif token_type == 'SpaceCharacters':
                # Any run of whitespace outside an ignored region collapses to one space.
                self.handle_text(' ')
            elif token_type == 'SerializeError':
                raise SerializeError(token['data'])
            # else, is a comment, doctype, entity, or unknown: nothing to do.

        # reset() rebinds self.sentences to a new list, so this reference stays intact.
        sentences_copy = self.sentences
        self.reset()
        return sentences_copy

    def reset(self):
        """Clear the collected sentences, the ignored-parent counter and the pending text buffer."""
        self.sentences = []
        self.ignored_parent_count = 0
        self.current_string = ''

    def handle_text(self, text):
        """Append ``text`` to the pending buffer unless the walker is inside an ignored element."""
        if self.ignored_parent_count > 0:
            return

        self.current_string += text

    def handle_starttag(self, tag_name):
        """
        Update the tokenizer state for a start tag.

        :param tag_name: Name of the element being opened.
        :raises ValueError: If the tag is in none of the accepted sets and raise_invalid_tags is true.
        """
        # if this tag is the child of an SVE or it is a header element and user would like to ignore headers
        if self.ignored_parent_count > 0:
            if tag_name in SENTENCE_VOID_ELEMENTS or (self.ignore_header_text and tag_name in HEADER_ELEMENTS):
                self.ignored_parent_count += 1
            return

        if tag_name in SENTENCE_VOID_ELEMENTS:
            self.handle_end_of_string()
            self.ignored_parent_count += 1
            return

        if tag_name in BLOCK_LEVEL_ELEMENTS:
            self.handle_end_of_string()
            return

        if tag_name in INLINE_ELEMENTS:
            return

        if self.raise_invalid_tags:
            raise ValueError("Parsing a tag which is not in the accepted element types. It is of type "
                             "{}".format(tag_name))
        else:
            # NOTE(review): handle_endtag has no matching decrement for unknown tags, so this
            # increment is only undone by a later SVE/header end tag. Left as-is.
            self.ignored_parent_count += 1

    def handle_endtag(self, tag_name):
        """
        Update the tokenizer state for an end tag.

        :param tag_name: Name of the element being closed.
        """
        if tag_name in SENTENCE_VOID_ELEMENTS or (self.ignore_header_text and tag_name in HEADER_ELEMENTS):
            # NOTE(review): a header start tag seen while the counter is 0 takes the
            # BLOCK_LEVEL_ELEMENTS branch in handle_starttag and is not counted, yet its end
            # tag is decremented here, so the counter can go negative. The header text is
            # still dropped because the buffer is cleared below. Left as-is.
            self.ignored_parent_count -= 1
            self.current_string = ''
            return

        # if in an SVE (and this tag is not an SVE).
        if self.ignored_parent_count > 0:
            return

        if tag_name in BLOCK_LEVEL_ELEMENTS:
            self.handle_end_of_string()
            return

        # if tag_name in INLINE_ELEMENTS, nothing is done.

    def handle_empty_tag(self, tag_name):
        """
        Treat an accepted empty tag as a sentence boundary.

        :param tag_name: Name of the empty element.
        :raises ValueError: If the tag is not in EMPTY_ELEMENTS.
        """
        if tag_name in EMPTY_ELEMENTS:
            self.handle_end_of_string()
        else:
            raise ValueError(
                "Parsing an empty tag which is not of the accepted element types. It is of type {}".format(tag_name))

    def handle_end_of_string(self):
        """Split the pending text into sentences, store them, and clear the buffer."""
        self.current_string = self.current_string.strip()

        if not self.current_string:
            return

        # The script only does ``import nltk``; the bare name ``sent_tokenize`` was never
        # imported and raised NameError, so the function is called through the module.
        for sentence in nltk.sent_tokenize(self.current_string):
            self.sentences.append(sentence.strip())

        self.current_string = ''

# The entry point function can contain up to two input arguments:
#   Param<dataframe1>: a pandas.DataFrame
#   Param<dataframe2>: a pandas.DataFrame
def azureml_main(dataframe1=None, dataframe2=None):
    """
    Add a 'sentences' column holding the sentences extracted from each row's HTML.

    :param dataframe1: DataFrame with a 'FullEmailContent' column of HTML documents.
    :param dataframe2: Unused.
    :return: A one-element tuple containing dataframe1 with the new 'sentences' column.
    """
    # Execution logic goes here
    print('Input pandas.DataFrame #1:\r\n\r\n{0}'.format(dataframe1))

    # feed() parses ONE document, so it is applied to each row rather than being handed the
    # whole column. feed() resets the tokenizer before returning, so one instance is reused.
    tokenizer = HTMLSentenceTokenizer()
    dataframe1['sentences'] = dataframe1['FullEmailContent'].apply(tokenizer.feed)

    # If a zip file is connected to the third input port,
    # it is unzipped under ".\Script Bundle". This directory is added
    # to sys.path. Therefore, if your zip file contains a Python file
    # mymodule.py you can import it using:
    # import mymodule

    # Return value must be of a sequence of pandas.DataFrame
    return dataframe1,
Azure Machine Learning
Azure Machine Learning
An Azure machine learning service for building and deploying models.
3,332 questions
{count} votes

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.