scrape_guilds.py: use playwright now
This commit is contained in:
parent
a233cfa432
commit
5d7349062d
24
flake.nix
24
flake.nix
|
@ -125,30 +125,8 @@
|
|||
python3Packages.aiohttp
|
||||
python3Packages.black
|
||||
python3Packages.ipython
|
||||
python3Packages.trio
|
||||
python3Packages.gql
|
||||
(python3Packages.selenium.overrideAttrs (old: {
|
||||
postInstall =
|
||||
old.postInstall
|
||||
+ ''
|
||||
for ver in v85 v126 v127 v128; do
|
||||
DEVTOOLS=../common/devtools/chromium/$ver
|
||||
for proto in js browser; do
|
||||
python3 ../common/devtools/convert_protocol_to_json.py \
|
||||
$DEVTOOLS/"$proto"_protocol.pdl \
|
||||
--map_binary_to_string=true \
|
||||
$DEVTOOLS/"$proto"_protocol.json
|
||||
done
|
||||
mkdir -p $DST_PREFIX/common/devtools/$ver
|
||||
python3 generate.py \
|
||||
$DEVTOOLS/browser_protocol.json \
|
||||
$DEVTOOLS/js_protocol.json \
|
||||
$DST_PREFIX/common/devtools/$ver
|
||||
done
|
||||
'';
|
||||
|
||||
nativeBuildInputs = old.nativeBuildInputs ++ [ python3Packages.inflection ];
|
||||
}))
|
||||
python3Packages.playwright
|
||||
];
|
||||
|
||||
CHROME = "${chromium}/bin/chromium";
|
||||
|
|
|
@ -3,82 +3,59 @@
|
|||
Scrape guilds with playwright
|
||||
You probably shouldn't use this, but if you really must:
|
||||
* Set the CHROME environment variable to the path to your chrome or chromium
|
||||
* You probably want to connect to an existing chrome instance so you can log in first,
|
||||
run `chromium --user-data-dir=$HOME/.cache/chromium-emoji-script --remote-debugging-port=9222`
|
||||
to start chromium then `env CHROME=$(command -v chromium) ./scrape_guilds.py 127.0.0.1:9222`
|
||||
* It only appends the raw gateway messages as JSON lines to `out/discord/events.json`; you have to extract the guild data from that file later
|
||||
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os.path
|
||||
import os
|
||||
import os.path
|
||||
import pathlib
|
||||
import sys
|
||||
import zlib
|
||||
|
||||
from selenium import webdriver
|
||||
import trio
|
||||
from playwright.async_api import async_playwright
|
||||
import asyncio
|
||||
|
||||
|
||||
def setup_driver():
|
||||
options = webdriver.ChromeOptions()
|
||||
if location := os.getenv("CHROME"):
|
||||
options.binary_location = location
|
||||
options.add_argument("--start-maximized")
|
||||
options.add_argument(
|
||||
"--user-data-dir=" + os.path.expanduser("~/.cache/chromium-emoji-script")
|
||||
)
|
||||
if len(sys.argv) > 1:
|
||||
options.add_experimental_option("debuggerAddress", sys.argv[1])
|
||||
def websocket_handler(out_file):
|
||||
async def inner(ws):
|
||||
print(f"Got websocket at `{ws.url}`")
|
||||
|
||||
# options.add_experimental_option("detach", True)
|
||||
return webdriver.Chrome(options=options)
|
||||
|
||||
|
||||
async def handle_events(listener, out_file):
|
||||
valid_id = None
|
||||
decompress = None
|
||||
|
||||
async for event in listener:
|
||||
typ = event.__class__.__name__
|
||||
if typ == "WebSocketCreated":
|
||||
if event.url.startswith("wss://gateway.discord.gg/"):
|
||||
valid_id = event.request_id
|
||||
decompress = zlib.decompressobj()
|
||||
elif typ == "WebSocketFrameReceived":
|
||||
if event.request_id != valid_id:
|
||||
continue
|
||||
message = json.loads(
|
||||
decompress.decompress(base64.b64decode(event.response.payload_data))
|
||||
)
|
||||
|
||||
async def handle_message(msg):
|
||||
message = json.loads(decompress.decompress(msg))
|
||||
message_type = message.get("t")
|
||||
await out_file.write(json.dumps(message) + "\n")
|
||||
out_file.write(json.dumps(message) + "\n")
|
||||
# The data we actually want, might as well flush
|
||||
if message_type in ["READY", "GUILD_CREATE"]:
|
||||
await out_file.flush()
|
||||
|
||||
out_file.flush()
|
||||
print("Got message of type", message_type)
|
||||
|
||||
ws.on("framereceived", handle_message)
|
||||
|
||||
return inner
|
||||
|
||||
|
||||
async def main():
|
||||
out_path = await trio.Path.cwd() / "out" / "discord" / "events.json"
|
||||
await out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
out_file = await out_path.open("a")
|
||||
out_path = pathlib.Path.cwd() / "out" / "discord" / "events.json"
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
out_file = out_path.open("a")
|
||||
|
||||
driver = setup_driver()
|
||||
async with driver.bidi_connection() as conn:
|
||||
devtools, session = conn.devtools, conn.session
|
||||
|
||||
await session.execute(devtools.network.enable())
|
||||
listener = session.listen(
|
||||
devtools.network.WebSocketCreated,
|
||||
devtools.network.WebSocketFrameReceived,
|
||||
buffer_size=1024,
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch_persistent_context(
|
||||
user_data_dir=os.path.expanduser("~/.cache/chromium-emoji-script"),
|
||||
executable_path=os.getenv("CHROME"),
|
||||
headless=False,
|
||||
)
|
||||
page = await browser.new_page()
|
||||
page.on("websocket", websocket_handler(out_file))
|
||||
await page.goto("https://discord.com/app")
|
||||
|
||||
await handle_events(listener, out_file)
|
||||
await asyncio.Future()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
trio.run(main)
|
||||
asyncio.run(main())
|
||||
|
|
Loading…
Reference in a new issue