Source code for jBScraper

import requests
import time
from bs4 import BeautifulSoup

# Scrapes data from https://www.oejv.com/bundesliga/ergebnissescores
# Disclaimer: there are not many checks for null objects, ...
class JBScraper:

    def __init__(self):
        """Creates a JBScraper object."""
        pass
    def scrapeJB(self):
        """
        Scrapes data from https://www.oejv.com/bundesliga/ergebnissescores
        and stores the collected data in .txt files.
        """
        #####################################################################
        url = "https://www.oejv.com/bundesliga/ergebnissescores/"
        placeToStoreResults = '../results/'
        startYear = 2011
        endYear = 2018
        #####################################################################

        year = 2011
        # for year in range(startYear, endYear):  # TODO activate

        # Prepare the output files and write the column headers
        nameBegegnungen = placeToStoreResults + 'Begegnungen' + str(year) + '.txt'  # file name built by concatenation
        fileBegegnungen = open(nameBegegnungen, 'w', encoding='utf-8')  # ATTENTION: overwrites file
        fileBegegnungen.write('id; nameheim; winFirstHalfH; UBWFirstHalfH; winEndH; UBWEndH; nameauswaerts; winFirstHalfA;'
                              ' UBWFirstHalfA; winEndA; UBWEndA\n')
        fileBegegnungen.close()  # close the header handle before reopening in append mode below

        nameEinzelKaempfe = placeToStoreResults + 'EinzelKaempfe' + str(year) + '.txt'  # file name built by concatenation
        fileEinzelKaempfe = open(nameEinzelKaempfe, 'w', encoding='utf-8')  # ATTENTION: overwrites file
        fileEinzelKaempfe.write('id; fightNo; GivennameFirstnameH; YukoH; WazzariH; IpponH; ShidoH; HSMH; WinH; UBWH;'
                                ' GivennameFirstnameA; YukoA; WazzariA; IpponA; ShidoA; HSMA; WinA; UBWA\n')
        fileEinzelKaempfe.close()  # close the header handle before reopening in append mode below

        # Request the results overview for the selected season
        data = {'jama_saison': year}
        r = requests.post('https://www.oejv.com/bundesliga/ergebnissescores/', data=data)
        soupFightsOverview = BeautifulSoup(r.content, 'html.parser')

        # Where the fight URLs are collected
        urlOfFights = []
        links = soupFightsOverview.find_all(onclick="openwin(this.href); return false")  # 115 elements
        # TODO find the round name (Runde 1, Runde 2, ...) here (siblings)
        idOfFights2 = []
        for link in links:
            idOfFights2.append(link.get("href"))
        idOfFights = idOfFights2[1:][::2]

        # URLs of the fights
        for link in idOfFights:
            urlOfFights.append(url + link)

        # Get first element
        # TODO still need to decide where the first league ends
        for urlOfFightUnicode in urlOfFights:
            urlOfFight = urlOfFightUnicode.encode('ascii', 'ignore')
            rFight = requests.get(urlOfFight)
            htmlFight = rFight.content
            soupSpecificFight = BeautifulSoup(htmlFight, 'html.parser')
            begegnungenId = urlOfFight[65:69]  # id of the Begegnung, sliced out of the URL

            # Create the file / append the Begegnung (team match)
            nameBegegnungen = placeToStoreResults + 'Begegnungen' + str(year) + '.txt'
            fileBegegnungen = open(nameBegegnungen, 'a', encoding='utf-8')  # ATTENTION: appends to file
            print(nameBegegnungen)

            # Name of the home team
            if not soupSpecificFight.find_all(colspan='7'):
                nameheim = soupSpecificFight.find_all(colspan='8')[0]
            else:
                nameheim = soupSpecificFight.find_all(colspan='7')[0]
            nameheimStr = nameheim.contents[0][7:]  # len('HEIM * ') == 7

            labels = soupSpecificFight.find_all(bgcolor="#bbbfbb")  # Find all fights

            # Half-time score (Zwischenstand) and final score (Endstand)
            if year == 2011:
                # TODO compute the Zwischenstand correctly for 2011
                zwischenstand = labels[2].find_all(bgcolor="#57a8f7")
                endstand = labels[2].find_all(bgcolor="#57a8f7")
            else:
                zwischenstand = labels[2].find_all(bgcolor="#57a8f7")
                endstand = labels[3].find_all(bgcolor="#57a8f7")

            # TODO why does this work?
            # Name of the away team
            if not soupSpecificFight.find_all(colspan='7'):
                nameauswaerts = soupSpecificFight.find_all(colspan='8')[1]
            elif len(soupSpecificFight.find_all(colspan='7')) > 1:
                nameauswaerts = soupSpecificFight.find_all(colspan='7')[1]
            else:
                nameauswaerts = soupSpecificFight.find_all(colspan='8')[0]
            nameauswaertsStr = nameauswaerts.contents[0][11:]  # len('AUSWAERTS * ') == 11

            # One line per Begegnung: id, home team, half-time and final scores, away team
            kampfzeile = begegnungenId.decode("utf-8") \
                + '; ' + nameheimStr \
                + '; ' + zwischenstand[0].contents[0] \
                + '; ' + zwischenstand[1].contents[0] \
                + '; ' + endstand[0].contents[0] \
                + '; ' + endstand[1].contents[0] \
                + '; ' + nameauswaertsStr \
                + '; ' + zwischenstand[2].contents[0] \
                + '; ' + zwischenstand[3].contents[0] \
                + '; ' + endstand[2].contents[0] \
                + '; ' + endstand[3].contents[0] \
                + '\n'
            fileBegegnungen.write(kampfzeile)

            # Append the EinzelKaempfe (individual bouts)
            nameEinzelKaempfe = placeToStoreResults + 'EinzelKaempfe' + str(year) + '.txt'
            fileEinzelKaempfe = open(nameEinzelKaempfe, 'a', encoding='utf-8')  # ATTENTION: appends to file

            einzelkaempfe = soupSpecificFight.find_all(bgcolor="#d5d5d5")  # Find all fights
            j = 0
            kampf = ''
            # loops over each fight
            for einzelkampf in einzelkaempfe:
                j = j + 1
                info = einzelkampf.find_all('td')
                kampf += begegnungenId.decode("utf-8") + '; ' + str(j)
                numberOfColumns = len(info)
                maxlength = 18
                withYuko = numberOfColumns == maxlength  # rows without Yuko columns have fewer cells
                for i in range(2, numberOfColumns):
                    if (not withYuko) and (i == 3 or i == 10):
                        kampf += '; -1'  # pad the missing Yuko columns
                    kampf += '; ' + (info[i].contents[0] if len(info[i]) > 0 else '_________')
                kampf += '\n'

            # writes the data into the file
            fileEinzelKaempfe.write(kampf)
            time.sleep(1)  # be polite to the server between requests
            kampf = ''

            # close the handles for this fight page
            fileBegegnungen.close()
            fileEinzelKaempfe.close()
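
# Optional sketch, not part of the original scraper: if the requests above should be
# retried on transient server errors, a requests.Session with an urllib3 Retry policy
# could be used instead of plain requests.get/post. The helper name
# make_retrying_session and the retry parameters below are illustrative assumptions.
def make_retrying_session():
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('https://', adapter)  # retry failed requests with exponential backoff
    session.mount('http://', adapter)
    return session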
if __name__ == "__main__":
    jBScraper = JBScraper()
    jBScraper.scrapeJB()
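
# Hedged usage sketch (an illustrative addition, not part of JBScraper): the files
# written by scrapeJB() are plain semicolon-separated text, so they can be read back
# with the standard csv module. load_begegnungen is a hypothetical helper; the default
# path matches placeToStoreResults and the year set above.
def load_begegnungen(path='../results/Begegnungen2011.txt'):
    import csv
    with open(path, encoding='utf-8') as resultFile:
        reader = csv.reader(resultFile, delimiter=';')
        header = [column.strip() for column in next(reader)]  # first line holds the column names
        return [dict(zip(header, (cell.strip() for cell in row))) for row in reader]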