play/scripts/emoji/scrape_guilds.py

85 lines
2.7 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Scrape guilds with selenium
You probably shouldn't use this, but if you really must:
* Set the CHROME environment variable to the path to your chrome or chromium
* You probably want to connect to an existing chrome instance so you can log in first,
run `chromium --user-data-dir=$HOME/.cache/chromium-emoji-script --remote-debugging-port=9222`
to start chromium then `env CHROME=$(command -v chromium) ./scrape_guilds.py 127.0.0.1:9222`
* It only records the raw gateway messages as JSON, one per line, in
out/discord/events.json; you have to extract what you need from that later
"""
import base64
import json
import os.path
import os
import sys
import zlib
from selenium import webdriver
import trio
def setup_driver():
    """Build the Chrome webdriver used for scraping.

    The browser binary is taken from the CHROME environment variable when it
    is set.  When a command-line argument is given it is passed to Chrome as
    the "debuggerAddress" experimental option.

    Returns:
        A ``webdriver.Chrome`` instance created with the assembled options.
    """
    chrome_options = webdriver.ChromeOptions()

    binary = os.getenv("CHROME")
    if binary:
        chrome_options.binary_location = binary

    profile_dir = os.path.expanduser("~/.cache/chromium-emoji-script")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--user-data-dir=" + profile_dir)

    # First CLI argument, when present, is the debugger address to use.
    cli_args = sys.argv[1:]
    if cli_args:
        chrome_options.add_experimental_option("debuggerAddress", cli_args[0])

    # NOTE: the "detach" experimental option is intentionally left disabled.
    return webdriver.Chrome(options=chrome_options)
async def handle_events(listener, out_file):
    """Record Discord gateway messages observed on the browser's websocket.

    Iterates the DevTools network events from *listener*.  A
    ``WebSocketCreated`` event whose URL starts with
    ``wss://gateway.discord.gg/`` selects the socket to follow and starts a
    fresh zlib stream for it.  ``WebSocketFrameReceived`` events for that
    socket are base64-decoded, decompressed, parsed as JSON and appended to
    *out_file* as one JSON document per line.

    Args:
        listener: Async iterable of events; they are told apart by class name
            (``WebSocketCreated`` / ``WebSocketFrameReceived``).
        out_file: Object with awaitable ``write(str)`` and ``flush()``.

    Raises:
        zlib.error, json.JSONDecodeError: If a completed gateway message
            cannot be decompressed or parsed.
    """
    # Discord's zlib-stream transport terminates every complete message with
    # the Z_SYNC_FLUSH marker; a message may arrive split over several frames.
    zlib_suffix = b"\x00\x00\xff\xff"

    gateway_id = None
    inflater = None
    pending = bytearray()

    async for event in listener:
        kind = type(event).__name__
        if kind == "WebSocketCreated":
            if event.url.startswith("wss://gateway.discord.gg/"):
                # New gateway socket: it gets its own zlib context and any
                # half-received data from a previous socket is discarded.
                gateway_id = event.request_id
                inflater = zlib.decompressobj()
                pending.clear()
        elif kind == "WebSocketFrameReceived":
            if inflater is None or event.request_id != gateway_id:
                continue
            pending += base64.b64decode(event.response.payload_data)
            if not pending.endswith(zlib_suffix):
                # Incomplete message; wait for the remaining frames instead
                # of feeding a partial document to json.loads.
                continue
            raw = inflater.decompress(bytes(pending))
            pending.clear()
            message = json.loads(raw)
            message_type = message.get("t")
            await out_file.write(json.dumps(message) + "\n")
            # The data we actually want, might as well flush
            if message_type in ("READY", "GUILD_CREATE"):
                await out_file.flush()
            print("Got message of type", message_type)
async def main():
    """Attach to Chrome and log gateway websocket traffic until interrupted.

    Appends to ``out/discord/events.json`` under the current working
    directory (creating the directories if needed), enables the DevTools
    network domain, and forwards websocket events to ``handle_events``.
    """
    out_path = await trio.Path.cwd() / "out" / "discord" / "events.json"
    await out_path.parent.mkdir(parents=True, exist_ok=True)
    # Close the file on every exit path (including Ctrl-C) so buffered
    # messages are written out rather than left to interpreter shutdown.
    async with await out_path.open("a") as out_file:
        driver = setup_driver()
        async with driver.bidi_connection() as conn:
            devtools, session = conn.devtools, conn.session
            await session.execute(devtools.network.enable())
            listener = session.listen(
                devtools.network.WebSocketCreated,
                devtools.network.WebSocketFrameReceived,
                buffer_size=1024,
            )
            await handle_events(listener, out_file)
# Script entry point: run the async main() under trio's event loop.
if __name__ == "__main__":
    trio.run(main)