The explanation is a bit long, but most of it is background for my problem (the problem is probably not Python-related, but extra information won't hurt):
I am currently working on a Django application. The application window (browser) has two iframes, each taking 50% of the screen. The left-hand side displays Snopes (the fact-checking website) pages, and the right-hand side displays one of the pages linked in that specific Snopes article.
A form at the bottom of the app lets the user choose and post whether the RHS page is a source of the claim in the Snopes article or not (there are also “invalid input” and “I don't know” options).
Submitting calls a function which tries to get the other links of the current Snopes page; if there are none, it gets any page annotated (in this priority) twice, then once, then the least-annotated remaining page (so 0, then 3, then 4, ...). This is done using a count.csv which simply stores how many times each page+link combination has been annotated (Snopes articles can repeat, and so can linked sites).
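For clarity, the selection priority boils down to something like this (a simplified, self-contained sketch; the DataFrame here is illustrative, in the app it comes from count.csv):

import pandas as pd

# Illustrative stand-in for the count.csv rows the annotator hasn't done yet
candidates = pd.DataFrame({
    "page": ["a", "b", "c"],
    "source_url": ["u1", "u2", "u3"],
    "count": [0, 2, 1],
})

twice = candidates[candidates["count"] == 2]
once = candidates[candidates["count"] == 1]
if len(twice) > 0:
    page = twice.iloc[0]["page"]
elif len(once) > 0:
    page = once.iloc[0]["page"]
else:
    # least-annotated fallback: 0 first, then 3, then 4, ...
    page = candidates.loc[candidates["count"].idxmin(), "page"]
print(page)  # -> "b" (a twice-annotated page wins)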
The header of count.csv is:
page source_url count
The pages to be displayed on either side are retrieved from a csv with the following header:
page claim verdict tags date author source_list source_url
And the user input is stored in a separate csv for each user inside a results directory, with the header:
page claim verdict tags date author source_list source_url value name
Here value is 1 (yes), 2 (no), 3 (invalid input), or 4 (don't know).
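To make the encoding explicit, here is the same mapping as a Python dict (illustration only, this is not in the app):

VALUE_LABELS = {1: "yes", 2: "no", 3: "invalid input", 4: "don't know"}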
The html of all the links in the first csv (called samples.csv) is retrieved in advance and stored using the article name as the directory name. The page itself is stored as “page.html”, and the sources are stored as “some_number.html”, where some_number is the index of the source in the source_list.
For example, the html of the first link in a Snopes article named “is-water-wet” will be
Annotator/annotator/data/html_snopes/is-water-wet/0.html
manage.py is in the Annotator directory.
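Here is roughly how a source's html path is derived (a sketch; the example URL and source_list are made up, but snopes_path would point at Annotator/annotator/data/html_snopes/):

# All names below are illustrative stand-ins for the app's real data
snopes_path = "Annotator/annotator/data/html_snopes/"
page = "https://www.snopes.com/fact-check/is-water-wet/"  # hypothetical page URL
source_list = ["http://example.com/a", "http://example.com/b"]

page_dir = page.strip("/").split("/")[-1] + "/"   # -> "is-water-wet/"
src_idx_num = source_list.index(source_list[0])   # index of the source in source_list
html_path = snopes_path + page_dir + str(src_idx_num) + ".html"
print(html_path)  # Annotator/annotator/data/html_snopes/is-water-wet/0.html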
After getting a row from samples (a pandas DataFrame created from samples.csv), my Django app gets all of the rows with the same page and automatically annotates the rows without a corresponding html path as 3 (invalid input), since a missing path means html retrieval failed.
When I ran the app on a virtual machine, I noticed a major issue. When I log in (to the app) with a user and annotate, the corresponding results csv for some reason grows from 316 KB to ~3 GB, and back once the app is terminated, even though the csv has only around 248 lines.
I checked the first couple of lines (of the results csv) and they look completely normal.
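In case it helps: a quick way to scan the whole file for runaway rows, instead of just the first lines, would be something like this (my own diagnostic sketch, not part of the app; results_path and name are the same variables used in the code below):

# Flag suspiciously long lines in the results csv
with open(results_path + name + ".csv", "rb") as f:
    for i, line in enumerate(f):
        if len(line) > 10_000:  # arbitrary threshold
            print(i, len(line), line[:80])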
Here is the code:
# Imports used by the functions below
# (results_path, count_path, samples_path, snopes_path, res_header are module-level settings)
import os
import ast
import codecs

import pandas as pd
from bs4 import BeautifulSoup as bs

def get_done_by_annotator(name):
    # Creates a list of pages that have already been annotated by the current annotator
    results_filename = results_path + name + ".csv"
    if os.path.exists(results_filename):
        results = pd.read_csv(results_filename, sep=',', encoding="latin1")
        done_by_annotator = (results["page"] + results["source_url"]).unique()
    else:
        done_by_annotator = []
    return done_by_annotator
def get_count_file(s_p):
    # Creates or reads countfile:
    if os.path.exists(count_path):
        count_file = pd.read_csv(count_path, sep=',', encoding="latin1").sample(frac=1)
    else:
        count_file = s_p[['page', 'source_url']].copy()
        count_file['count'] = 0
        count_file.to_csv(count_path, sep=',', index=False)
    return count_file
def increase_page_annotation_count(page, origin):
    # Increments the count for this page+source combination and rewrites count.csv
    count_file = pd.read_csv(count_path, sep=',', encoding="latin1")
    count_file.loc[(count_file['page'] == page) & (count_file['source_url'] == origin), 'count'] += 1
    count_file.to_csv(count_path, sep=',', index=False)
def save_annotation(page, origin, value, name):
    # Read samples file
    print("SAVING ANNOTATION")
    s_p = pd.read_csv(samples_path, sep='\t', encoding="latin1")
    entry = s_p.loc[(s_p["page"] == page) & (s_p["source_url"] == origin)]
    if not entry.empty:
        n_entry = entry.values.tolist()[0]
        n_entry.extend([value, name])
        results_filename = results_path + name + ".csv"
        if os.path.exists(results_filename):
            results = pd.read_csv(results_filename, sep=',', encoding="latin1")
        else:
            results = pd.DataFrame(columns=res_header)
        oldEntry = results.loc[(results["page"] == page) & (results["source_url"] == origin)]
        if oldEntry.empty:
            results.loc[len(results)] = n_entry
            results.to_csv(results_filename, sep=',', index=False)
            # keeps track of how many times page was annotated
            increase_page_annotation_count(page, origin)
def get_least_annotated_page(name, aPage=None):
    done_by_annotator = get_done_by_annotator(name)

    # Print number of annotated pages and total number of pages
    s_p = pd.read_csv(samples_path, sep='\t', encoding="latin1")
    print("done: ", len(done_by_annotator), " | total: ", len(s_p))
    if len(done_by_annotator) == len(s_p):
        return "Last annotation done! Thank you!", None, None, None, None, None, None, None

    # Creates or reads countfile:
    count_file = get_count_file(s_p)

    # Get pages not done by current annotator
    not_done_count = count_file.loc[~(count_file['page'] + count_file['source_url']).isin(done_by_annotator)]

    print(">>", aPage)
    if aPage is not None:
        remOrigins = not_done_count.loc[not_done_count['page'] == aPage]
        if len(remOrigins) == 0:
            return get_least_annotated_page(name)
    else:
        twice_annotated = not_done_count.loc[not_done_count['count'] == 2]
        if len(twice_annotated) > 0:
            page = twice_annotated.iloc[0]['page']
        else:
            once_annotated = not_done_count.loc[not_done_count['count'] == 1]
            if len(once_annotated) > 0:
                page = once_annotated.iloc[0]['page']
            else:
                index = not_done_count['count'].idxmin(axis=0, skipna=True)
                page = not_done_count.loc[index]['page']
        remOrigins = not_done_count.loc[not_done_count['page'] == page]
    page = remOrigins.iloc[0].page

    # Automatically annotate broken links of this page as invalid input (op = 3)
    src_lst = s_p.loc[s_p['page'] == page]
    src_lst = ast.literal_eval(src_lst.iloc[0].source_list)
    for idx, e in remOrigins.iterrows():
        src_idx_num = src_lst.index(e.source_url)
        if not os.path.exists(snopes_path + (e.page.strip("/").split("/")[-1] + "/") + str(src_idx_num) + ".html"):
            save_annotation(e.page, e.source_url, "3", name)

    # Update done_by_annotator, count_file, and not_done_count
    done_by_annotator = get_done_by_annotator(name)
    count_file = get_count_file(s_p)
    not_done_count = count_file.loc[~(count_file['page'] + count_file['source_url']).isin(done_by_annotator)]

    remOrigins = not_done_count.loc[not_done_count['page'] == page]
    if len(remOrigins) == 0:
        return get_least_annotated_page(name)

    entry = remOrigins.iloc[0]
    entry = s_p[(s_p.page.isin([entry.page]) & s_p.source_url.isin([entry.source_url]))].iloc[0]

    a_page = entry.page.strip()
    o_page = entry.source_url.strip()
    src_lst = entry.source_list.strip()

    a_page_path = a_page.strip("/").split("/")[-1] + "/"
    src_idx_num = src_lst.index(o_page)
    o_page_path = a_page_path + str(src_idx_num) + ".html"

    f = codecs.open(snopes_path + a_page_path + "page.html", encoding='utf-8')
    a_html = bs(f.read(), "lxml")
    f = codecs.open(snopes_path + o_page_path, encoding='utf-8')
    o_html = bs(f.read(), "lxml")

    # a_done and a_total are defined elsewhere (omitted here)
    return a_page, o_page, str(a_html), str(o_html), src_lst, a_done, a_total, len(done_by_annotator)