# This Python file uses the following encoding: utf-8
'''
Created on Jan 30, 2021

@author: lukelindemann

Wikipedia metadata: https://en.wikipedia.org/wiki/Help:Cheatsheet
'''
import re
from os import listdir

try:
    from bz2file import BZ2File as bzopen  # For Multistream
except ImportError:
    # Fall back to the standard library when bz2file is not installed;
    # on Python 3.3+ bz2.BZ2File also reads multi-stream archives.
    from bz2 import BZ2File as bzopen


def open_bz2(filename, line_limit=100000):
    '''
    Read the first lines of a bz2 archive.

    filename:   path of the .bz2 file
    line_limit: stop once this many lines have been read
    Returns the lines in the form the reader yields them.
    '''
    lines = []
    # The with-statement closes the archive; no explicit close() is needed.
    with bzopen(filename, "r") as bzfin:
        for i, line in enumerate(bzfin):
            if i == line_limit:
                break
            lines.append(line)
    return lines


def _drop_nested(page, opener, closer):
    '''
    Return page without the text enclosed by (possibly nested) delimiters.

    opener and closer are two-character strings. A character is kept only
    while the nesting depth is zero, so the delimiters themselves and
    everything between them are dropped. An opener that is never closed
    drops the rest of the page.
    '''
    depth = 0
    kept = []
    for i, char in enumerate(page):
        # Leaving a level: the closer ended on the previous character.
        # A closer with nothing open is ignored instead of pushing the depth
        # below zero, which used to discard the remainder of the page.
        if i >= 2 and depth > 0 and page.startswith(closer, i - 2):
            depth -= 1
        # Entering a level: the opener starts on this character.
        if page.startswith(opener, i):
            depth += 1
        # Write if outside every delimited region
        if depth == 0:
            kept.append(char)
    # Joining once avoids the quadratic cost of repeated string +=
    return ''.join(kept)


def del_wiki_tab(page):
    '''
    Delete everything between nested wiki table tags: {| ... |}
    '''
    return _drop_nested(page, '{|', '|}')


def del_wiki_temp(page):
    '''
    Delete everything between nested {{tags}}, including templates,
    citation tags, math formulas, infoboxes.

    NOTE(review): an earlier comment said this must be run after
    del_wiki_tab, but wiki_clean calls it first. The call order is left
    as it was - confirm which one is intended.
    '''
    # Rewrite the braces as two distinct characters so that runs such as
    # {{{ or }}} are paired off left to right instead of overlapping.
    page = page.replace('{{', '{>').replace('}}', '<}')
    return _drop_nested(page, '{>', '<}')


def wiki_clean(page):
    '''
    Strip wiki markup and XML tags from one page of a Wikipedia dump.

    page: the text of a single page
    Returns the remaining text.
    '''
    page_clean = page

    # Delete Tables and Templates
    page_clean = del_wiki_temp(page_clean)
    page_clean = del_wiki_tab(page_clean)

    # Strip the XML tags that share a line with the article text, then delete
    # every remaining line that contains a tag.
    # Note: These are XML tags, not wysiwyg Wiki Markup tags
    # NOTE(review): the two patterns below were lost from the source and are
    # reconstructed as the opening/closing <text> tags - confirm.
    page_clean = re.sub(r'[\t ]*<text.*?>', '', page_clean)
    page_clean = re.sub(r'</text>', '', page_clean)
    page_clean = re.sub(r'^.*[<>].*$', r'', page_clean, flags=re.MULTILINE)  # Any line with < or >

    # Delete ''Italic'' and '''Bold''' formatting
    page_clean = re.sub(r"'{2,}", r'', page_clean)

    # Links:
    # Delete doubly embedded links (links within file captions)
    # e.g. [[File:Wiki.png|thumb| [[Caption1]] and [[Caption 2]] ]]
    page_clean = re.sub(r'\[\[[^\[\]]*(\[\[[^\[\]]*\]\][^\[\]]*)+[^\[\]]*\]\]', r'', page_clean)
    # Delete if it has a colon -> [[Category:Category Name]] [[File:Wiki.png|thumb|Caption]]
    page_clean = re.sub(r'\[\[[^\]]*:[^\]]*\]\]', r'', page_clean)
    # If piped, take the second element: [[ LinkName | LinkAlias ]] -> LinkAlias
    page_clean = re.sub(r'\[\[[^\]]+\|([^\]]+)\]\]', r'\1', page_clean)
    # Otherwise, just remove double brackets
    page_clean = re.sub(r'(\[\[)|(\]\])', r'', page_clean)
    # Delete website link and description of form [http://www.link.com description]
    page_clean = re.sub(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', r'', page_clean)
    # Delete bare website links
    page_clean = re.sub(r'(\w+):\/\/(.*?)( |$)', r'', page_clean, flags=re.MULTILINE)

    # NOTE(review): the line deletion above already removed every line that
    # contains < or >, so the tag patterns below cannot match at this point.
    # Dump text may store markup entity-escaped (&lt;ref&gt;); confirm which
    # form these patterns were meant to target.
    # References, math, imagemap, gallery...: delete everything from the
    # opening tag to its closing tag, or a self-closing tag
    for tag in ('ref', 'math', 'imagemap', 'mapframe', 'gallery',
                'inputbox', 'timeline', 'center'):
        page_clean = re.sub(r'<' + tag + r'.*?((/' + tag + r'>)|(/>))', '',
                            page_clean, flags=re.DOTALL)
    # HTML tables: delete everything from the opening to the closing tag
    for tag in ('table', 'td', 'tr'):
        page_clean = re.sub(r'<' + tag + r'.*?/' + tag + r'>', '',
                            page_clean, flags=re.DOTALL)
    # Comments, other tags: delete anything between < and >
    page_clean = re.sub(r'<.*?>', '', page_clean, flags=re.DOTALL)

    # Delete full lines that start with Headings (==Heading==), Lists (*X, ;X, #X),
    # #REDIRECT, &, -, : (indent), footnote marks (¹²³°†) and parts of tables (!,|,})
    page_clean = re.sub(r'^[\*#&\-=;!¹²³°†:\|\}].*$', r'', page_clean, flags=re.MULTILINE)

    # Delete Table of Content Metadata:
    page_clean = re.sub(r'(__FORCETOC__)|(__TOC__)|(__NOTOC__)|(__NOEDITSECTION__)', r'', page_clean)

    # Turn non-breaking spaces into spaces:
    page_clean = re.sub(r'&?nbsp;', r' ', page_clean)

    # Delete special character codes like &atilde;
    page_clean = re.sub(r'&\w+;', r'', page_clean)

    # Delete lines that start with a numeral (almost always lists or timelines)
    page_clean = re.sub(r'^\d.*$', r'', page_clean, flags=re.MULTILINE)

    # Delete words separated by two or more dots or dashes on a single line: A - B - C
    page_clean = re.sub(r'^([\w\s]+ [-·•—|]+ ){2,}.*$', r'', page_clean, flags=re.MULTILINE)

    # Delete a line consisting of a single word:
    page_clean = re.sub(r'^\w+[ :-]?$', r'', page_clean, flags=re.MULTILINE)

    # Delete multiple line breaks
    page_clean = re.sub(r'\n{3,}', r'\n\n', page_clean)

    return page_clean


### Wikipedia scraper: takes a bz2 dump file and writes to a given file
def make_wiki(file_to_read, file_to_write, allowed_namespaces=('0',),
              max_line_read=float('inf'), min_article_len=0,
              max_article_count=float('inf'), max_word_count=float('inf')):
    '''
    Clean the pages of a bz2 dump file and append them to file_to_write.

    file_to_read:       path of the bz2 dump file
    file_to_write:      path of the output file (opened in append mode)
    allowed_namespaces: namespace values of the pages to keep
    max_line_read:      number of lines of the dump to read
    min_article_len:    skip articles with fewer words than this after cleaning
    max_article_count:  stop after writing this many articles
    max_word_count:     stop before the written total would exceed this

    Each article is preceded by a '#Article ...' header line and the two
    '#Total ...' lines are appended at the end.

    NOTE(review): the page-splitting part of this function was lost from the
    source; it is reconstructed here on the assumption that pages are
    delimited by <page>/</page> lines and carry <title> and <ns> elements.
    '''
    full = open_bz2(file_to_read, line_limit=max_line_read)
    # The archive reader may yield bytes; the regular expressions need text.
    full = [line.decode('utf-8') if isinstance(line, bytes) else line
            for line in full]

    # Start and end indices of every page
    init_ind = [i for i, item in enumerate(full) if '<page>' in item]
    end_ind = [i for i, item in enumerate(full) if '</page>' in item]

    article_count = 0
    word_count = 0
    with open(file_to_write, "a", encoding="utf-8") as out:
        # zip() drops a final page whose end was cut off by the line limit
        for start, end in zip(init_ind, end_ind):
            p = ''.join(full[start:end + 1])
            title_match = re.search('<title>(.*)</title>', p)
            ns_match = re.search('<ns>(.*)</ns>', p)
            if title_match is None or ns_match is None:
                continue
            if ns_match.group(1) not in allowed_namespaces:
                continue
            title = title_match.group(1)

            p = wiki_clean(p)
            article_len = len(p.split())
            if article_len < min_article_len:
                continue
            if article_count >= max_article_count:
                break
            # Check the limit before writing, so that the totals always
            # describe exactly what is in the output file.
            if word_count + article_len > max_word_count:
                break

            word_count += article_len
            article_count += 1
            out.write('#Article {}: {} ({} words)\n'.format(article_count, title, article_len))
            out.write(p)
            out.write('\n\n\n')
            if article_count == max_article_count:
                break

        out.write('#Total Article count: ' + str(article_count) + '\n')
        out.write('#Total Word count: ' + str(word_count))


#### MAIN CODE ###
# Place the filepath to a folder containing Wikipedia Dump Files here
in_dir = '/Volumes/LukeDisk2/WikiTexts/test'

# For make_wiki below:
# Allowed Namespaces: which Wikipedia article types ('0' for articles)
# Maximum Lines to Read: for big bz2 files, restrict to a certain number of lines to go quicker
# Minimum Article Length: Exclude articles that contain fewer than a certain number of words
# Maximum Article Count: Include only the first X number of articles
# Maximum Word Count: Stop running before reaching a certain maximum number of words
if __name__ == '__main__':
    for f in listdir(in_dir):
        if str(f).endswith('bz2'):
            print(f)
            a = in_dir + '/' + f
            # Output file: the dump's name up to its first dash
            b = in_dir + '/' + re.sub(r'\-.*$', '', f)
            make_wiki(a, b, allowed_namespaces=['0'], max_line_read=20000000,
                      min_article_len=100, max_article_count=float('inf'),
                      max_word_count=200000)