💾 Archived View for gmn.clttr.info › sources › geminispace.git › tree › scripts › build_threads.py.t… captured on 2022-06-11 at 23:40:45.
-=-=-=-=-=-=-
from peewee import JOIN

from gus import constants
from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
from gus.lib.gemini import GeminiResource

collapsible_log_variations = [
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/3albums/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/hitenheroes/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/cornedbeef/"),
    ("gemini://gemini.circumlunar.space/~", "gemini://gemini.circumlunar.space/users/"),
    ("gemini://cetacean.club", "gemini://maj.kahless.cetacean.club"),
]


def find_thread_tops(resource, first_seen, page_id, content, current_chain=[]):
    """
    This function will recursively walk up to the tops of all threads a given
    page belongs to, then call recurse_thread on each of them to actually
    build the full threads.
    """
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            resource = GeminiResource(collapsible[0] + resource.fetchable_url[len(collapsible[1]):])
            break
    u = resource.indexable_url.rstrip("/")
    parent_pages_query = Page.raw(
        """SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
        FROM page AS p_from
        JOIN indexable_crawl AS ic ON ic.page_id == p_to.id
        JOIN crawl AS c ON c.page_id == p_to.id
        JOIN link as l ON l.from_page_id == p_from.id
        JOIN page as p_to ON p_to.id == l.to_page_id
        WHERE p_from.url IN (?, ?)
        AND p_to.normalized_url != ?
        AND c.status == 20
        AND p_to.content_type LIKE 'text/%'
        GROUP BY p_to.normalized_url
        ORDER BY l.is_cross_host_like, p_to.url ASC""",
        u, f"{u}/", resource.normalized_url)
    found_threadable_parents = False
    for parent_page in parent_pages_query.iterator():
        parent_resource = GeminiResource(parent_page.fetchable_url)
        for collapsible in collapsible_log_variations:
            if resource.normalized_url.startswith(collapsible[1]):
                parent_resource = GeminiResource(collapsible[0] + resource.fetchable_url[len(collapsible[1]):])
                break
        # Skip any parents that are already in the list of seen resources for this call
        # stack - it means they're circular linking
        if any(r for r in current_chain if r.normalized_url == resource.normalized_url):
            continue
        if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
            found_threadable_parents = True
            find_thread_tops(
                parent_resource,
                parent_page.first_seen,
                parent_page.id,
                parent_page.content,
                current_chain + [resource])
    if not found_threadable_parents:
        # return early if thread top already processed
        try:
            query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "001")
            query.get()
            print(f"\nAlready done: {resource.fetchable_url}")
            return
        except ThreadPage.DoesNotExist:
            pass
        full_thread = recurse_thread(resource, "001", first_seen, page_id, content)

        # Deduplicate
        full_thread.reverse()
        i = 0
        while i < len(full_thread):
            if any(x for x in full_thread[i+1:] if x[0].normalized_url == full_thread[i][0].normalized_url):
                full_thread.pop(i)
            else:
                i += 1
        full_thread.reverse()

        thread_updated_at = max(m[2] for m in full_thread)
        thread = Thread.create(updated_at=thread_updated_at)
        print()
        for m in full_thread:
            ThreadPage.create(
                thread=thread,
                page_id=m[3],
                address=m[1],
                friendly_author=m[0].get_friendly_author(m[4]),
                friendly_title=m[0].get_friendly_title(m[4]),
            )
            print(" -> [{:<19}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))


def recurse_thread(resource, path, first_seen, page_id, content, current_chain=[]):
    if not resource.is_valid or not resource.is_log_post_like:
    # if not resource.is_valid:
        return []
    u = resource.indexable_url.rstrip("/")
    from_urls = [
        u,
        f"{u}/",
    ]
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            new_u = collapsible[0] + resource.indexable_url[len(collapsible[1]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
        elif resource.normalized_url.startswith(collapsible[0]):
            new_u = collapsible[1] + resource.indexable_url[len(collapsible[0]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
    children_query = Page.raw(
        """SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
        FROM page AS p_from
        JOIN indexable_crawl AS ic ON ic.page_id == p_from.id
        JOIN crawl AS c ON c.page_id == p_from.id
        JOIN link as l ON l.from_page_id == p_from.id
        JOIN page as p_to ON p_to.id == l.to_page_id
        WHERE p_to.url IN (""" + ", ".join(["?" for x in range(len(from_urls))]) + """)
        AND p_from.normalized_url != ?
        AND c.status == 20
        AND p_from.content_type LIKE 'text/%'
        GROUP BY p_from.normalized_url
        ORDER BY l.is_cross_host_like, first_seen ASC""",
        *from_urls, resource.normalized_url)
    threadable_child_index = 1
    new_thread_members = [(
        resource,
        path,
        first_seen,
        page_id,
        content,
    )]
    processed_collapsed_urls = []
    for child in children_query.iterator():
        collapsed_url = child.fetchable_url
        for collapsible in collapsible_log_variations:
            if child.normalized_url.startswith(collapsible[1]):
                collapsed_url = collapsible[0] + child.fetchable_url[len(collapsible[1]):]
                break
        if collapsed_url in processed_collapsed_urls:
            continue
        processed_collapsed_urls.append(collapsed_url)
        child_resource = GeminiResource(collapsed_url)
        if is_threadable_link(child_resource, resource, child.is_cross_host_like):
            # Skip any parents that are already in the list of seen resources for this call
            # stack - it means they're circular linking
            if any(r for r in current_chain if r.normalized_url == resource.normalized_url):
                continue
            child_path = f"{path:0>3}.{threadable_child_index:03}"
            new_thread_members.extend(recurse_thread(
                child_resource,
                child_path,
                child.first_seen,
                child.id,
                child.content,
                current_chain + [resource]
            ))
            threadable_child_index += 1
    return new_thread_members


def is_threadable_link(r1, r2, is_cross_host_like):
    return r1.is_log_post_like and r2.is_log_post_like and is_cross_host_like


def main():
    db = init_db(f"index/{constants.DB_FILENAME}")
    Thread.delete().execute()
    ThreadPage.delete().execute()
    pages_query = Page.raw(
        """SELECT p.*, MIN(c.timestamp) AS first_seen
        FROM page AS p
        JOIN indexable_crawl AS ic ON ic.page_id == p.id
        JOIN crawl AS c ON c.page_id == p.id
        LEFT JOIN threadpage AS tp ON tp.page_id == p.id
        WHERE tp.page_id IS NULL
        AND c.status == 20
        AND p.content_type LIKE 'text/%'
        GROUP BY p.normalized_url
        """)
    for page in pages_query.iterator():
        resource = GeminiResource(page.fetchable_url)
        if resource.is_valid and resource.is_log_post_like:
            find_thread_tops(resource, page.first_seen, page.id, page.content)
    print("\nDone!")


if __name__ == "__main__":
    main()
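
In the listing above, collapsible_log_variations pairs a canonical gemlog URL prefix with an alternate prefix that should be threaded as the same log (for example the /users/ and /~ forms on gemini.circumlunar.space). Whenever the script meets a URL starting with the second element of a pair, it re-roots the remainder onto the first element before threading, so two spellings of one post cannot become separate thread members. The snippet below is a minimal standalone sketch of that rewrite, not part of build_threads.py; the helper name collapse_url is invented for illustration, and the check is simplified to a single URL string rather than the script's separate normalized/fetchable URLs.

```
# Standalone sketch of the prefix collapsing done inline in build_threads.py.
# collapse_url is a hypothetical helper; the pairs are (canonical, alternate)
# prefixes as in collapsible_log_variations above.
collapsible_log_variations = [
    ("gemini://gemini.circumlunar.space/~", "gemini://gemini.circumlunar.space/users/"),
    ("gemini://cetacean.club", "gemini://maj.kahless.cetacean.club"),
]

def collapse_url(url):
    for canonical_prefix, alternate_prefix in collapsible_log_variations:
        if url.startswith(alternate_prefix):
            # Keep everything after the alternate prefix, re-rooted on the canonical one.
            return canonical_prefix + url[len(alternate_prefix):]
    return url

print(collapse_url("gemini://gemini.circumlunar.space/users/solderpunk/gemlog/foo.gmi"))
# gemini://gemini.circumlunar.space/~solderpunk/gemlog/foo.gmi
```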
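
The address values stored on ThreadPage encode each page's position in its thread as dot-separated, zero-padded indices: the thread top is "001", its replies "001.001", "001.002", and so on, built in recurse_thread via child_path = f"{path:0>3}.{threadable_child_index:03}". Sorting rows by address therefore reproduces the thread in depth-first order. Below is a small standalone sketch of that scheme with made-up page titles; it is not part of build_threads.py.

```
# Standalone illustration of the thread "address" scheme used above.
# The page titles and tree structure here are hypothetical.
def child_address(parent_address, child_index):
    # Mirrors: child_path = f"{path:0>3}.{threadable_child_index:03}"
    return f"{parent_address:0>3}.{child_index:03}"

top = "001"                          # thread top
reply_1 = child_address(top, 1)      # "001.001"
reply_2 = child_address(top, 2)      # "001.002"
nested = child_address(reply_1, 1)   # "001.001.001"

pages = {top: "top post", reply_2: "second reply",
         nested: "reply to first reply", reply_1: "first reply"}

# Sorting by address yields depth-first thread order:
for address in sorted(pages):
    print(address, "->", pages[address])
# 001 -> top post
# 001.001 -> first reply
# 001.001.001 -> reply to first reply
# 001.002 -> second reply
```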