"""Parse brewery addresses from html-file"""
import codecs
import re
from html.parser import HTMLParser
def add_space(string):
    """Insert a space between a word character and a following ASCII capital.

    Matches do not overlap, so inside a run of capitals only every second
    gap receives a space (``"ABC"`` becomes ``"A BC"``).
    """
    def _separate(match):
        # Re-emit both captured characters with a single space in between.
        before, capital = match.groups()
        return before + " " + capital

    return re.sub(r"(\w)([A-Z])", _separate, string)
# Path of the HTML input; handed to parse_file() by the call at the end of
# this module.
HTMFILE_PATH = "German-Breweries_table"
# Path of the text file that parse_file() writes its result lines to.
OUTPUT_FILE_PATH = "brewerylist"
def parse_map(url):
    """Parse address from url.

    The url is split on ``/``. Segment 4 starts with the zip code. Two
    layouts are understood:

    * ``.../<zip>-<city words>/<street words>-<number>``
    * ``.../<zip>/<city>/<street words>-<number>``

    Dashes inside the city and street parts are turned into spaces; the text
    after the last dash of the street segment is the house number.

    Args:
        url: Map link to take the address from.

    Returns:
        ``[zip, city, street, number]`` as strings, or a list of four
        ``None`` values when the url has fewer than 6 or more than 8
        segments, or lacks the segment that should hold the street.
    """
    words = url.split('/')
    if not 6 <= len(words) <= 8:
        return [None] * 4

    zip_parts = words[4].split('-')
    zip_ = zip_parts[0]
    if len(zip_parts) < 2:
        # City and street live in their own segments here, so segment 6 must
        # exist; a 6-segment url would otherwise raise IndexError.
        if len(words) < 7:
            return [None] * 4
        city = words[5]
        street_parts = words[6].split('-')
    else:
        city = " ".join(zip_parts[1:])
        street_parts = words[5].split('-')

    road = " ".join(street_parts[:-1])
    number = street_parts[-1]
    return [zip_, city, road, number]
class RowParser(HTMLParser):
    """Parse one row of table.

    While the HTML of a row is fed in, every text node and every ``href``
    attribute value is appended to ``data`` in document order.
    ``print_row`` then converts those cells into output lines and empties
    ``data`` so the same parser can take the next row.
    """

    # Link texts that close one entry of the row.
    _MARKERS = ('map', 'tap', 'tap1', 'tap2')
    # Markers whose entry reuses the URL of the preceding entry.
    _TAP_MARKERS = ('tap', 'tap1', 'tap2')
    # Text cells that glue two name parts together.
    _SEPARATORS = ('/', '-', '/ ', '&', ' /')
    # Fragments identifying annotation cells that are dropped silently.
    _IGNORED_FRAGMENTS = ("not currently brewing", "(new", "Production to move")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance buffer; a class-level list would be shared between
        # all parser instances.
        self.data = []

    def print_row(self):
        """Convert the cells collected for one row into output lines.

        The first cell is skipped; the remaining cells are scanned in order.
        URLs, names and addresses are remembered until a marker cell
        (``map``, ``tap``, ``tap1``, ``tap2``) or a coordinate cell closes
        the entry.

        Returns:
            A list of strings, one per entry:
            ``"<number> <street>, <city>, Germany;<name>;<url>;<marker>"``
            for an address parsed from a goyellow.de link, or
            ``"<coordinates>;<name>;<url>;map"`` for a coordinate cell.
            Rows without cells and rows whose first cell is ``PLACE`` or
            ``DISTRICT`` give an empty list.

        Side effects:
            Empties ``self.data`` and prints diagnostics to stdout.
        """
        if not self.data or self.data[0] in ("PLACE", "DISTRICT"):
            self.data = []
            return []

        number = street = city = zipcode = None
        name = None
        last_url = "unknown"
        url = "unknown"
        places = []
        is_two_part = False
        last_cell = ""
        last_cell_is_name = False

        for cell in self.data[1:]:
            if "http" in cell:
                last_cell_is_name = False
                if "goyellow.de" in cell:
                    zipcode, city, street, number = parse_map(cell)
                elif is_two_part:
                    url = url + ", " + cell
                else:
                    url = cell
            elif cell in self._MARKERS:
                last_cell_is_name = False
                if cell in self._TAP_MARKERS:
                    url = last_url
                if any(part is None
                       for part in (number, city, street, zipcode)):
                    # No usable address for this marker: emit a blank line
                    # and drop the entry.
                    print("")
                else:
                    places.append(
                        "{} {}, {}, Germany;{};{};{}".format(
                            number, street, city, name, url, cell))
                # Start the next entry of the row with a clean slate.
                number = street = city = zipcode = None
                last_url = url
                url = "unknown"
                is_two_part = False
            elif 'N, ' in cell and 'E' in cell:
                # Coordinate cell: written out as-is in place of an address.
                last_cell_is_name = False
                places.append("{};{};{};{}".format(cell, name, url, "map"))
                number = street = city = zipcode = None
                last_url = url
                url = "unknown"
                is_two_part = False
                # Makes the following text cell get skipped like the one
                # after a real marker.
                last_cell = 'tap'
                continue
            else:
                if last_cell in self._MARKERS:
                    # Text directly after a marker is never taken as a name.
                    last_cell = cell
                    continue
                if cell == ' ' or any(fragment in cell
                                      for fragment in self._IGNORED_FRAGMENTS):
                    continue
                if cell in ('Closed?', 'C') or '(formerly ' in cell:
                    last_cell = 'tap'
                    continue
                if cell in self._SEPARATORS:
                    is_two_part = True
                if is_two_part:
                    # A separator may show up before any name was seen, so
                    # fall back to an empty prefix.
                    name = (name or "") + cell
                    last_cell_is_name = True
                    last_cell = cell
                    continue
                if last_cell_is_name:
                    if cell not in ('\xa0', ' '):
                        print("last cell: " + last_cell)
                        print("Invalid cell after breweryname: " + cell)
                        print("")
                        print(self.data)
                        print("")
                else:
                    name = add_space(cell)
                    last_cell_is_name = True
            last_cell = cell

        self.data = []
        return places

    def handle_data(self, data):
        """Collect a text node as a cell."""
        self.data.append(data)

    def handle_starttag(self, tag, attrs):
        """Collect the value of every ``href`` attribute as a cell."""
        for attr_name, attr_value in attrs:
            # HTMLParser reports attribute names in lower case; a bare
            # ``href`` without a value arrives as None and is skipped.
            if attr_name == 'href' and attr_value is not None:
                self.data.append(attr_value)
def parse_file(file_path, output_path=None):
    """Parse file.

    Reads the UTF-8 encoded HTML file line by line. Every line containing
    ``<tr>`` starts a row; the following lines are appended (stripped of
    surrounding whitespace) until one contains ``</tr>``. Each row is fed to
    a ``RowParser`` and the resulting lines are written to the output file,
    one per line.

    Args:
        file_path: Path of the HTML file to read.
        output_path: Path of the file to write; defaults to
            ``OUTPUT_FILE_PATH``.
    """
    if output_path is None:
        output_path = OUTPUT_FILE_PATH

    places = []
    rowparser = RowParser()
    with codecs.open(file_path, 'r', 'UTF-8') as file:
        lines = iter(file)
        for line in lines:
            if "<tr>" not in line:
                continue
            parts = [line.strip()]
            while "</tr>" not in line:
                # An empty string means end of file: stop instead of
                # spinning forever on an unterminated row.
                line = next(lines, "")
                if not line or "www.aida.de" in line:
                    break
                parts.append(line.strip())
            rowparser.feed("".join(parts))
            places.extend(rowparser.print_row())

    # Written as UTF-8 to match the input encoding, independent of the
    # platform default.
    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.writelines(place + '\n' for place in places)
# Runs on import as well as on direct execution: there is no __main__ guard.
parse_file(HTMFILE_PATH)