GIT repositories brewerymap / master parse_addresses.py
master

Tree @master (Download .tar.gz)

parse_addresses.py @master · raw · history · blame

"""Parse brewery addresses from html-file"""
import codecs
import re
from html.parser import HTMLParser



def add_space(string):
    """Insert a blank between a word character and a capital letter following it.

    Matches are non-overlapping, so in a run of capitals only every second
    boundary receives a blank (``"ABC"`` becomes ``"A BC"``).
    """
    def _with_blank(match):
        # group 1: the preceding word character, group 2: the capital letter
        leading, capital = match.group(1), match.group(2)
        return leading + " " + capital

    return re.sub(r"(\w)([A-Z])", _with_blank, string)

# Path of the HTML table file that is handed to parse_file() at module level.
HTMFILE_PATH = "German-Breweries_table"
# Path of the text file parse_file() writes the extracted places to
# (one place per line).
OUTPUT_FILE_PATH = "brewerylist"


def parse_map(url):
    """Extract the address parts encoded in the path of a map url.

    The url is split on ``/`` and two layouts are understood:

    * ``.../<zip>-<city>/<street>-<number>`` (segments 4 and 5)
    * ``.../<zip>/<city>/<street>-<number>`` (segments 4, 5 and 6)

    Dashes inside the city and street parts are turned into blanks; the
    last dash-separated token of the street segment is taken as the house
    number.

    Args:
        url: The url string to take apart.

    Returns:
        A list ``[zip, city, road, number]`` of strings, or a list of four
        ``None`` values when the url has fewer than 6 or more than 8
        segments, or lacks the street segment its layout requires.
    """
    unparsable = [None, None, None, None]

    words = url.split('/')
    if len(words) < 6 or len(words) > 8:
        return unparsable

    zip_parts = words[4].split('-')
    zip_ = zip_parts[0]
    if len(zip_parts) < 2:
        # Zip code stands alone: city and street follow in their own
        # segments, so a seventh segment must exist.
        if len(words) < 7:
            return unparsable
        city = words[5]
        street_parts = words[6].split('-')
    else:
        # Zip code and city share one segment, the street is the next one.
        city = " ".join(zip_parts[1:])
        street_parts = words[5].split('-')

    road = " ".join(street_parts[:-1])
    number = street_parts[-1]
    return [zip_, city, road, number]





class RowParser(HTMLParser):
    """Collect the cells of one table row and turn them into place records.

    While markup is fed to the parser, every text node and every ``href``
    attribute value is appended to ``self.data`` in document order.
    ``print_row`` then interprets that flat list and empties it again, so
    one parser instance can be reused for many rows.
    """

    # Link texts that close one address entry within a row.
    _MARKERS = ('map', 'tap', 'tap1', 'tap2')
    # Markers whose entry reuses the url of the previous entry.
    _TAP_MARKERS = ('tap', 'tap1', 'tap2')
    # Cells that glue two name parts together.
    _NAME_JOINERS = ('/', '-', '/ ', '&', ' /')
    # Remarks that are dropped without influencing the following cell.
    _IGNORED_REMARKS = ("not currently brewing", "(new", "Production to move")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance buffer; a class-level list would be shared by all
        # parsers until the first print_row() call.
        self.data = []

    def print_row(self):
        """Build place records from the buffered cells and clear the buffer.

        Despite its name the method does not print the records, it returns
        them. The first cell of the row is only inspected to detect header
        rows (``PLACE`` / ``DISTRICT``); all remaining cells are
        interpreted one after another.

        Returns:
            A list of strings, one per place found in the row, formatted
            either as ``"<number> <street>, <city>, Germany;<name>;<url>;<marker>"``
            or, for coordinate cells, as ``"<coordinates>;<name>;<url>;map"``.
            An empty list is returned for empty rows and header rows.
        """
        if not self.data:
            # Nothing was collected for this row.
            return []
        if self.data[0] in ("PLACE", "DISTRICT"):
            self.data = []
            return []

        number = None
        street = None
        city = None
        zipcode = None
        name = None
        last_url = "unknown"
        url = "unknown"
        places = []

        is_two_part = False
        last_cell = ""
        last_cell_is_name = False

        for cell in self.data[1:]:

            if "http" in cell:
                last_cell_is_name = False
                if "goyellow.de" in cell:
                    # Map link: the address is encoded in the url itself.
                    zipcode, city, street, number = parse_map(cell)
                elif is_two_part:
                    url = url + ", " + cell
                else:
                    url = cell
            elif cell in self._MARKERS:
                last_cell_is_name = False
                if cell in self._TAP_MARKERS:
                    url = last_url
                if any(part is None
                       for part in (number, city, street, zipcode)):
                    # Incomplete address: no record, only a blank line on
                    # stdout.
                    print("")
                else:
                    places.append(
                        "{} {}, {}, Germany;{};{};{}".format(
                            number, street, city, name, url, cell))
                # Start over for the next entry of the same row.
                number = None
                street = None
                city = None
                zipcode = None
                last_url = url
                url = "unknown"
                is_two_part = False
            elif 'N, ' in cell and 'E' in cell:
                # Coordinate cell: emitted as is instead of an address.
                last_cell_is_name = False
                places.append("{};{};{};{}".format(cell, name, url, "map"))
                number = None
                street = None
                city = None
                zipcode = None
                last_url = url
                url = "unknown"
                is_two_part = False
                # Pretend a marker was seen so the next plain cell is skipped.
                last_cell = 'tap'
                continue
            else:
                if last_cell in self._MARKERS:
                    # The plain cell directly after a marker is skipped.
                    last_cell = cell
                    continue
                if cell == ' ' or any(remark in cell
                                      for remark in self._IGNORED_REMARKS):
                    continue
                if cell in ('Closed?', 'C') or '(formerly ' in cell:
                    # Status remarks; the plain cell after them is skipped
                    # as well.
                    last_cell = 'tap'
                    continue
                if cell in self._NAME_JOINERS:
                    is_two_part = True
                if is_two_part:
                    # A joiner may show up before any name was seen.
                    name = (name or "") + cell
                    last_cell_is_name = True
                    last_cell = cell
                    continue
                if last_cell_is_name:
                    if cell not in ['\xa0', ' ']:
                        print("last cell: " + last_cell)
                        print("Invalid cell after breweryname: " + cell)
                        print("")
                        print(self.data)
                        print("")
                else:
                    name = add_space(cell)
                    last_cell_is_name = True
            last_cell = cell

        self.data = []
        return places

    def handle_data(self, data):
        """Buffer the text content found between tags."""
        self.data.append(data)

    def handle_starttag(self, tag, attrs):
        """Buffer the target of every link found in a start tag."""
        for attr_name, attr_value in attrs:
            # A value-less attribute is reported with value None; skipping
            # it keeps the buffer free of non-string cells.
            if attr_name in ('href', 'HREF') and attr_value is not None:
                self.data.append(attr_value)





def parse_file(file_path):
    """Extract all places from an HTML table file and write them to disk.

    The file is read as UTF-8. Every line containing ``<tr>`` starts a row;
    following lines are appended (stripped of surrounding whitespace) until
    a line containing ``</tr>`` has been consumed. A line containing
    ``www.aida.de`` ends the row early and is itself discarded. Each row is
    fed to a ``RowParser`` and the resulting place records are written,
    one per line and UTF-8 encoded, to ``OUTPUT_FILE_PATH``.

    Args:
        file_path: Path of the HTML file to read.
    """
    places = []
    rowparser = RowParser()
    with codecs.open(file_path, 'r', 'UTF-8') as file:
        for line in file:
            if "<tr>" not in line:
                continue
            row = line.strip()
            while "</tr>" not in line:
                line = file.readline()
                # readline() returns '' at end of file; without this check
                # an unterminated row would loop forever.
                if not line or "www.aida.de" in line:
                    break
                row += line.strip()
            rowparser.feed(row)
            places.extend(rowparser.print_row())

    # Write with the same encoding the input was read with, so non-ASCII
    # names do not depend on the platform default encoding.
    with open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as outfile:
        outfile.writelines(place + '\n' for place in places)




# Module-level entry point: the conversion runs whenever this file is
# executed or imported.
parse_file(HTMFILE_PATH)