# scripts/build_threads.py (from geminispace.git)

from gus import constants
from gus.lib.db_model import init_db, Link, Page, Thread, ThreadPage
from gus.lib.gemini import GeminiResource

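# Pairs of URL prefixes that point at the same gemlog: URLs matching the second
# prefix are rewritten onto the first so that aliased posts thread together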
collapsible_log_variations = [
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/3albums/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/hitenheroes/"),
    ("gemini://gemini.circumlunar.space/~solderpunk/gemlog/", "gemini://gemini.circumlunar.space/~solderpunk/cornedbeef/"),
    ("gemini://gemini.circumlunar.space/~", "gemini://gemini.circumlunar.space/users/"),
    ("gemini://cetacean.club", "gemini://maj.kahless.cetacean.club"),
]


def find_thread_tops(resource, first_seen, page_id, content, current_chain=None):
    """
    Recursively walk up to the tops of all threads a given page belongs to,
    then call recurse_thread on each of them to build the full threads.
    """
    if current_chain is None:
        current_chain = []
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            resource = GeminiResource(collapsible[0] + resource.fetchable_url[len(collapsible[1]):])
            break
    u = resource.indexable_url.rstrip("/")
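    # Candidate parents are the pages this page links out to: a reply links back
    # to the post it responds to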
    parent_pages_query = Page.raw("""SELECT p_to.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
FROM page AS p_from
JOIN link AS l
ON l.from_page_id == p_from.id
JOIN page AS p_to
ON p_to.id == l.to_page_id
JOIN indexable_crawl AS ic
ON ic.page_id == p_to.id
JOIN crawl AS c
ON c.page_id == p_to.id
WHERE p_from.url IN (?, ?)
AND p_to.normalized_url != ?
AND c.status == 20
AND p_to.content_type LIKE 'text/%'
GROUP BY p_to.normalized_url
ORDER BY l.is_cross_host_like, p_to.url ASC""", u, f"{u}/", resource.normalized_url)
    found_threadable_parents = False
    for parent_page in parent_pages_query.iterator():
        parent_resource = GeminiResource(parent_page.fetchable_url)
        for collapsible in collapsible_log_variations:
            if parent_page.normalized_url.startswith(collapsible[1]):
                parent_resource = GeminiResource(collapsible[0] + parent_page.fetchable_url[len(collapsible[1]):])
                break
        # Skip any parent that is already in the chain of resources seen in this
        # call stack - a repeat means the pages link to one another circularly
        if any(r for r in current_chain if r.normalized_url == parent_resource.normalized_url):
            continue
        if is_threadable_link(resource, parent_resource, parent_page.is_cross_host_like):
            found_threadable_parents = True
            find_thread_tops(
                parent_resource,
                parent_page.first_seen,
                parent_page.id,
                parent_page.content,
                current_chain + [resource])
    if not found_threadable_parents:
        # return early if thread top already processed
        try:
            query = ThreadPage.select().join(Page).where(Page.url == resource.indexable_url, ThreadPage.address == "001")
            query.get()
            print(f"\nAlready done: {resource.fetchable_url}")
            return
        except ThreadPage.DoesNotExist:
            pass
        full_thread = recurse_thread(resource, "001", first_seen, page_id, content)

        # Deduplicate
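        # The same page can be reached through more than one reply chain; keep
        # only the first copy encountered during the walk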
        full_thread.reverse()
        i = 0
        while i < len(full_thread):
            if any(x for x in full_thread[i+1:] if x[0].normalized_url == full_thread[i][0].normalized_url):
                full_thread.pop(i)
            else:
                i += 1
        full_thread.reverse()

        thread_updated_at = max(m[2] for m in full_thread)
        thread = Thread.create(updated_at=thread_updated_at)
        print()
        for m in full_thread:
            ThreadPage.create(
                thread=thread,
                page_id=m[3],
                address=m[1],
                friendly_author=m[0].get_friendly_author(m[4]),
                friendly_title=m[0].get_friendly_title(m[4]),
            )
            print(" -> [{:<19}] [{}] {}".format(m[1], m[2], m[0].fetchable_url))


def recurse_thread(resource, path, first_seen, page_id, content, current_chain=None):
    """
    Walk down from a thread top, depth-first, building the full thread.
    """
    if current_chain is None:
        current_chain = []
    if not resource.is_valid or not resource.is_log_post_like:
        return []
    u = resource.indexable_url.rstrip("/")
    from_urls = [
        u,
        f"{u}/",
    ]
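    # Also search for links pointing at this page via any known alias of its URL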
    for collapsible in collapsible_log_variations:
        if resource.normalized_url.startswith(collapsible[1]):
            new_u = collapsible[0] + resource.indexable_url[len(collapsible[1]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
        elif resource.normalized_url.startswith(collapsible[0]):
            new_u = collapsible[1] + resource.indexable_url[len(collapsible[0]):]
            from_urls.extend([new_u, f"{new_u}/"])
            break
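    # Candidate children are the pages that link to this page under any of its URLs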
    children_query = Page.raw("""SELECT p_from.*, l.is_cross_host_like, MIN(c.timestamp) AS first_seen
FROM page AS p_from
JOIN indexable_crawl AS ic
ON ic.page_id == p_from.id
JOIN crawl AS c
ON c.page_id == p_from.id
JOIN link AS l
ON l.from_page_id == p_from.id
JOIN page AS p_to
ON p_to.id == l.to_page_id
WHERE p_to.url IN (""" + ", ".join(["?" for x in range(len(from_urls))]) + """)
AND p_from.normalized_url != ?
AND c.status == 20
AND p_from.content_type LIKE 'text/%'
GROUP BY p_from.normalized_url
ORDER BY l.is_cross_host_like, first_seen ASC""", *from_urls, resource.normalized_url)
    threadable_child_index = 1
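    # Each thread member is a (resource, address, first_seen, page_id, content) tuple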
    new_thread_members = [(
        resource,
        path,
        first_seen,
        page_id,
        content,
    )]
    processed_collapsed_urls = []
    for child in children_query.iterator():
        collapsed_url = child.fetchable_url
        for collapsible in collapsible_log_variations:
            if child.normalized_url.startswith(collapsible[1]):
                collapsed_url = collapsible[0] + child.fetchable_url[len(collapsible[1]):]
                break
        if collapsed_url in processed_collapsed_urls:
            continue
        processed_collapsed_urls.append(collapsed_url)
        child_resource = GeminiResource(collapsed_url)
        if is_threadable_link(child_resource, resource, child.is_cross_host_like):
            # Skip any child that is already in the chain of resources seen in
            # this call stack - a repeat means the pages link circularly
            if any(r for r in current_chain if r.normalized_url == child_resource.normalized_url):
                continue
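            # Addresses are dot-separated, zero-padded segments: the thread top
            # is "001", its first reply "001.001", its second "001.002", and
            # each nesting level adds another segment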
            child_path = f"{path}.{threadable_child_index:03}"
            new_thread_members.extend(recurse_thread(
                child_resource,
                child_path,
                child.first_seen,
                child.id,
                child.content,
                current_chain + [resource]
            ))
            threadable_child_index += 1
    return new_thread_members


def is_threadable_link(r1, r2, is_cross_host_like):
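    # Both endpoints must look like gemlog posts, and the link between them must
    # have been flagged "is_cross_host_like" by the crawler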
    return r1.is_log_post_like and r2.is_log_post_like and is_cross_host_like


def main():
    db = init_db(f"index/{constants.DB_FILENAME}")
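    # Threads are rebuilt from scratch on every run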
    Thread.delete().execute()
    ThreadPage.delete().execute()
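    # Select every indexable text page not yet assigned to a thread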
    pages_query = Page.raw("""SELECT p.*, MIN(c.timestamp) AS first_seen
FROM page AS p
JOIN indexable_crawl AS ic
ON ic.page_id == p.id
JOIN crawl AS c
ON c.page_id == p.id
LEFT JOIN threadpage AS tp
ON tp.page_id == p.id
WHERE tp.page_id IS NULL
AND c.status == 20
AND p.content_type LIKE 'text/%'
GROUP BY p.normalized_url
""")
    for page in pages_query.iterator():
        resource = GeminiResource(page.fetchable_url)
        if resource.is_valid and resource.is_log_post_like:
            find_thread_tops(resource, page.first_seen, page.id, page.content)
    print("\nDone!")


if __name__ == "__main__":
    main()