# Source code for mapache.parseutils

import mapache

from bs4 import BeautifulSoup
import urllib.request

from dateutil.parser import parse

try:
    from tqdm import tqdm
except ImportError:
    # tqdm is optional: when it is not installed, fall back to a pass-through
    # so that ``for x in tqdm(xs)`` keeps working, just without a progress bar.
    def tqdm(*args, **kwargs):
        """Return the wrapped iterable unchanged.

        The iterable is taken from the first positional argument or, failing
        that, from the ``iterable`` keyword. Any other argument is ignored.
        ``None`` is returned when no iterable was given.
        """
        if args:
            return args[0]
        return kwargs.get('iterable')


def poll_from_table(table, date_column, party_columns, name=None,
                    party_names=None, error_column=None,
                    pollster_column=None, poll_rows=None):
    """Build a ``mapache.PollsList`` from the rows of a wiki table.

    Args:
        table: Table element, as accepted by ``wikitable_get_rows``.
        date_column: Index of the cell holding the date of each poll.
        party_columns: ``(start, stop)`` slice bounds of the cells holding
            one value per party.
        name: Name given to the resulting ``mapache.PollsList``.
        party_names: Party names, in the same order as the party cells.
            When falsy, the names are taken from the link found in each
            party cell of the first row of the table.
        error_column: When not ``None``, an error value is parsed for every
            poll (see the review note in the body about which cell is read).
        pollster_column: Index of the cell holding the pollster name, or
            ``None`` to leave the pollster unset.
        poll_rows: ``(start, stop)`` slice bounds of the rows that contain
            polls. When falsy, every row from the third one on is used.

    Returns:
        The populated ``mapache.PollsList``, or ``None`` when a party name
        could not be extracted from the first row.
    """
    # TODO deal with multicolumns
    # TODO: report errors correctly

    if not poll_rows:
        # TODO, is it a good guess?
        # The stop bound must be None: ``-0`` is just ``0`` and would make
        # the slice ``rows[2:0]`` empty.
        poll_rows = (2, None)

    # TODO if wrong columns in args?
    # The list is always needed, whether or not an error column was given.
    polls = mapache.PollsList(name)

    if not party_names:
        # Try to extract the names from the first row...
        party_names = []
        header_row = wikitable_get_rows(table)[0]
        header_cells = wikitable_get_cells(header_row)
        for c in header_cells[party_columns[0]:party_columns[1]]:
            _, party_name = wikitable_get_url(c)
            if party_name is None:
                # TODO
                return None
            party_names.append(party_name)

    rows = wikitable_get_rows(table)[poll_rows[0]:poll_rows[1]]

    for row in tqdm(rows):
        cells = wikitable_get_cells(row)
        if len(cells) <= 3:
            # TODO ??
            # Rows with three cells or fewer are skipped.
            continue

        pollster = None
        if pollster_column is not None:
            pollster = cells[pollster_column].text

        err = None
        if error_column is not None:
            # NOTE(review): the cell read is hard-coded as the third from the
            # end; ``error_column`` only switches the parsing on. Confirm
            # whether ``cells[error_column]`` was intended.
            err = cells[-3].text
            if err:
                # Drop the first character and keep the number that precedes
                # the first space.
                err = float(err[1:].split(' ')[0])

        date = parse(cells[date_column].text)

        votes = {}
        party_cells = cells[party_columns[0]:party_columns[1]]
        for i, p in enumerate(party_cells):
            v = p.text
            if not v:
                continue
            try:
                v = float(v)
            except ValueError:
                # Multiple parties in the cell, fix?
                continue
            votes[party_names[i]] = v

        polls.add(mapache.Poll(votes, date, pollster, err))

    return polls
def wikitable_get_rows(table):
    """Return every ``tr`` element found under ``table``."""
    return table.find_all("tr")


def wikitable_get_cells(row):
    """Return the cells of ``row``: all ``th`` cells first, then all ``td``.

    Note that the header cells always come first in the result, whatever
    their position in the row.
    """
    return row.find_all("th") + row.find_all("td")


def wikitable_get_url(c):
    """Return ``(url, name)`` for the first link found in cell ``c``.

    ``url`` is the link target prefixed with ``http://wikipedia.org`` and
    ``name`` is the ``title`` attribute of the link. Either value is ``None``
    when the corresponding attribute is missing, and both are ``None`` when
    the cell has no link.
    """
    anchor = c.find("a")
    if not anchor:
        return None, None

    # Not every anchor carries both attributes; a missing one yields None
    # instead of raising KeyError.
    href = anchor.attrs.get("href")
    url = None if href is None else "http://wikipedia.org" + href
    return url, anchor.attrs.get("title")


def wikitable_get_imgurl(c):
    """Return the ``src`` of the first image in cell ``c``, or ``None``.

    The ``src`` value is prefixed with ``http:``.
    """
    img = c.find("img")
    imgurl = None
    if img:
        imgurl = "http:" + img.attrs["src"]
    return imgurl


def tables_from_wiki(url):
    """Fetch ``url`` and return its ``table`` elements of class ``wikitable``."""
    # TODO Add title of the section to identify the table?
    # The response is parsed inside the ``with`` so the connection is closed
    # as soon as the document has been read.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")
    return soup.find_all("table", class_="wikitable")


def party_from_wiki(url, name=None):
    """Create a ``mapache.Party`` from the infobox of a party wiki page.

    Args:
        url: Address of the wiki page of the party.
        name: Name for the party. When falsy, the full name found in the
            infobox is used.

    Returns:
        A ``mapache.Party`` built with the name, logo URL, short name and
        full name.

    Raises:
        AttributeError: If the page has no ``infobox vcard`` table, or the
            infobox lacks the logo image or the ``fn org`` name it falls
            back to.
    """
    # The party wiki page is fetched to get the full name and full logo
    with urllib.request.urlopen(url) as page:
        party_soup = BeautifulSoup(page, "html.parser")

    infobox = party_soup.find("table", {"class": "infobox vcard"})

    # The logo URL comes from the ``src`` of the image inside the logo cell.
    logo_cell = infobox.find("td", {"class": "logo"})
    logo = "http:" + logo_cell.find("img").attrs["src"]

    # If both the English and Spanish name are present the Spanish one
    # has class "nickname", otherwise it has "fn org"
    nickname_span = infobox.find("span", {"class": "nickname"})
    if nickname_span:
        full_name = nickname_span.text
    else:
        full_name = infobox.find("span", {"class": "fn org"}).text

    if not name:
        name = full_name

    short_name = None
    abbreviation = infobox.find("td", {"class": "nickname"})
    if abbreviation:
        short_name = abbreviation.text

    # With the data obtained from the wiki a new party is created
    party = mapache.Party(name,
                          logo_url=logo,
                          short_name=short_name,
                          # The abbreviation/short name will be created
                          # automatically if the name provided is too long
                          # and short_name=None
                          full_name=full_name)
    return party