play/scripts/emoji/scrape_guilds.py

62 lines
1.7 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Scrape guilds with playwright
You probably shouldn't use this, but if you really must:
* Set the CHROME environment variable to the path to your chrome or chromium
* It just gives you json, you have to dump later
"""
import base64
import json
import os
2024-10-31 05:43:36 +00:00
import os.path
import pathlib
import sys
import zlib
2024-10-31 05:43:36 +00:00
from playwright.async_api import async_playwright
import asyncio
2024-10-31 05:43:36 +00:00
def websocket_handler(out_file):
    """Build a ``websocket`` event callback that logs received frames as JSON lines.

    Args:
        out_file: Writable text file object. Every decoded message is
            re-serialized with ``json.dumps`` and written as one line.

    Returns:
        A coroutine function taking the websocket object. When called it
        registers a ``framereceived`` listener on that websocket.
    """
    # Trailer that zlib emits for a Z_SYNC_FLUSH. With Discord's "zlib-stream"
    # transport compression a payload is only complete once the buffered data
    # ends with these four bytes.
    sync_flush_suffix = b"\x00\x00\xff\xff"

    async def inner(ws):
        print(f"Got websocket at `{ws.url}`")

        # One inflater per websocket: all binary frames of a connection are
        # fed through the same decompression context.
        decompress = zlib.decompressobj()
        # Compressed bytes received so far for a payload that is not complete yet.
        pending = bytearray()

        async def handle_message(msg):
            if isinstance(msg, str):
                # Text frames are not compressed; feeding a str to zlib would
                # raise TypeError, so parse them directly.
                raw = msg
            else:
                pending.extend(msg)
                # Check the tail of the whole buffer (not just this frame) so a
                # suffix split across two frames is still recognized.
                if pending[-4:] != sync_flush_suffix:
                    return
                raw = decompress.decompress(bytes(pending))
                pending.clear()

            try:
                message = json.loads(raw)
            except ValueError:
                # Covers JSONDecodeError and UnicodeDecodeError: the frame is
                # not something this script can log, so skip it.
                print("Skipping frame that is not valid JSON")
                return

            # Only JSON objects carry a "t" (event type) field.
            message_type = message.get("t") if isinstance(message, dict) else None

            out_file.write(json.dumps(message) + "\n")
            # The data we actually want, might as well flush
            if message_type in ["READY", "GUILD_CREATE"]:
                out_file.flush()
            print("Got message of type", message_type)

        ws.on("framereceived", handle_message)

    return inner
2024-10-31 05:43:36 +00:00
async def main():
    """Open Discord in a persistent Chromium profile and log its websocket traffic.

    Events are appended to ``out/discord/events.json`` under the current
    working directory (the directory is created if needed). The browser is
    launched with a visible window, using the executable named by the
    ``CHROME`` environment variable when it is set. The coroutine then waits
    forever; stop the script by interrupting it.
    """
    out_path = pathlib.Path.cwd() / "out" / "discord" / "events.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # The context manager guarantees buffered events are flushed and the file
    # is closed when the never-ending wait below is cancelled or interrupted.
    with out_path.open("a") as out_file:
        async with async_playwright() as p:
            browser = await p.chromium.launch_persistent_context(
                user_data_dir=os.path.expanduser("~/.cache/chromium-emoji-script"),
                executable_path=os.getenv("CHROME"),
                headless=False,
            )

            page = await browser.new_page()
            # Register the listener before navigating so websockets opened
            # during page load are seen.
            page.on("websocket", websocket_handler(out_file))
            await page.goto("https://discord.com/app")

            # Never resolved: keeps the browser and the listeners alive.
            await asyncio.Future()
if __name__ == "__main__":
    # Executed as a script: run the capture coroutine on a fresh event loop.
    capture = main()
    asyncio.run(capture)