代码
import json
import time
from datetime import datetime
from zoneinfo import ZoneInfo
from scrapy import Spider, Request
from urllib.parse import urlencode
from config import config
from .config import list_ids
from utils.spider_failed_alert import ErrorMonitor
"""
采集所有list动态,但是不包括评论, 官方接口
"""
class TwitterListOfficial(Spider):
    """Spider that fetches the tweets of each configured Twitter list.

    One request per entry of ``list_ids`` is sent to the official v2
    endpoint ``/2/lists/{id}/tweets``. ``parse`` flattens every returned
    tweet together with its author into a single dict item. Comments are
    not collected: the ``comments`` field of every item is an empty list.
    """

    author = 'drake.shi'
    name = 'twitter_list_official'
    MONGO_COL = 'twitter_list'
    change_ua = False
    # Do not go through a proxy.
    proxy = False
    # Scheduled every 20 minutes (in theory a higher frequency would
    # capture more data).
    # NOTE(review): 12 * 20 == 240 and the unit is not visible in this
    # file -- confirm that this value really means 20 minutes.
    schedule_time = 12 * 20
    api = "https://api.twitter.com/2/lists/{}/tweets"
    custom_settings = {
        'DNSCACHE_ENABLED': False,
        'REACTOR_THREADPOOL_MAXSIZE': 1,
        'DOWNLOAD_DELAY': 5
    }

    def start_requests(self):
        """Yield one request per configured list id (up to 100 tweets each)."""
        # Headers and query string do not depend on the list id, so they
        # are built once instead of once per iteration.
        headers = {
            "Authorization": f"Bearer {config.X_BEARER_TOKEN}",
            # Key setting: do not reuse keep-alive connections.
            "Connection": "close",
        }
        # parse() itself only reads id, text, created_at, author_id and
        # public_metrics of a tweet plus id, name, username,
        # profile_image_url and public_metrics of its author; the remaining
        # fields are requested but not used in this class.
        params = {
            "max_results": 100,
            "tweet.fields": "attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld",
            "expansions": "author_id,attachments.media_keys,attachments.poll_ids,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id",
            "user.fields": "created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld",
            "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text",
            "poll.fields": "duration_minutes,end_datetime,id,options,voting_status",
            "place.fields": "contained_within,country,country_code,full_name,geo,id,name,place_type"
        }
        query = urlencode(params)
        for list_id in list_ids:
            yield Request(
                url=f"{self.api.format(list_id)}?{query}",
                headers=headers
            )

    @staticmethod
    def _to_shanghai_time(created_at):
        """Convert an ISO-8601 UTC timestamp to ``YYYY-MM-DD HH:MM:SS`` in UTC+8.

        Args:
            created_at: timestamp string such as ``2025-04-23T02:31:02.000Z``.

        Returns:
            The same instant formatted in the ``Asia/Shanghai`` time zone.
        """
        # datetime.fromisoformat() accepts a trailing "Z" only on newer
        # Python versions, so spell the UTC offset out explicitly.
        dt_utc = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
        return dt_utc.astimezone(ZoneInfo("Asia/Shanghai")).strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def _build_user_info(user, author_id):
        """Flatten one expanded user object into the stored ``user_info`` dict.

        Args:
            user: user object from ``includes.users``; may be an empty dict
                when the author was not part of the expansion.
            author_id: ``author_id`` of the tweet, used as the id fallback.

        Returns:
            dict with the author's profile fields. Fields missing from
            ``user`` are ``None`` (the avatar becomes an empty string).
        """
        metrics = user.get('public_metrics') or {}
        # Example avatar URL:
        # https://pbs.twimg.com/profile_images/1675587952015974400/jvaLP8ty_normal.jpg
        # The API returns the compressed avatar; dropping "_normal" turns it
        # into the URL of the original (large) image.
        icon = (user.get('profile_image_url') or '').replace('_normal', '')
        user_id = user.get('id', author_id)
        return {
            # Display name
            'name': user.get('name'),
            # Handle
            'user_name': user.get('username'),
            # User id
            'user_id': user_id,
            'rest_id': user_id,
            # Wallet addresses (always emitted empty here)
            'wallet_address': [],
            # Follower count
            'followers': metrics.get('followers_count'),
            # Following count
            'following': metrics.get('following_count'),
            # Like count
            'favourites_count': metrics.get('like_count'),
            # Media count
            'media_count': metrics.get('media_count'),
            'listed_count': metrics.get('listed_count'),
            # Tweet count
            'statuses_count': metrics.get('tweet_count'),
            # Avatar
            'icon': icon
        }

    @ErrorMonitor(name, author)
    def parse(self, response, **kwargs):
        """Turn one list-tweets response into flattened tweet items.

        Each request returns roughly 100 tweets.

        Yields:
            dict: one item per tweet, keyed by the tweet id (``_id``), with
            the author's profile nested under ``user_info``.

        Raises:
            KeyError: if the payload has no ``data`` array and does not
                report an empty result, so that unexpected payloads still
                surface as errors.
        """
        data = json.loads(response.text)
        # Tweets
        tweets = data.get("data")
        if tweets is None:
            # An empty list is expected to come back without a "data" array
            # and with meta.result_count == 0; that is not an error.
            if data.get("meta", {}).get("result_count") == 0:
                return
            raise KeyError("data")
        # Authors, indexed by id
        users = data.get("includes", {}).get("users", [])
        user_map = {u["id"]: u for u in users}
        # Rebuild the data structure
        for tweet in tweets:
            author_id = tweet["author_id"]
            user = user_map.get(author_id)
            if user is None:
                # Keep the tweet instead of aborting the whole response
                # when its author is missing from the expansion.
                self.logger.warning(
                    "author %s of tweet %s missing from includes.users",
                    author_id, tweet['id'],
                )
                user = {}
            yield {
                '_id': tweet['id'],
                # Text
                'content': tweet['text'],
                # Creation time, converted to UTC+8
                'created_at': self._to_shanghai_time(tweet['created_at']),
                # Comments (not collected by this spider)
                'comments': [],
                # Like count
                'favorite_count': tweet['public_metrics']['like_count'],
                # Reply count
                'reply_count': tweet['public_metrics']['reply_count'],
                # Retweet count
                'retweet_count': tweet['public_metrics']['retweet_count'],
                'user_info': self._build_user_info(user, author_id),
            }
响应体解析后的单个Item样例
{
"author_id": "1482629674035466240",
"text": "⚡️#TAO/USDT⚡️\n\n🟢LONG/BUY: 311.80 - 305.00\n\n🏹Targets: 317.00 - 325.00 - 335.00 - 350.00 - 380.00+🚀\n\n❌Stop Loss 298.00\n\n‼️Leverage: 20X 10X (Use Leverage according to your risk management)\n\n👉Use only upto 5% of Total Funds\n#qatar #UAE #Dubai #Kuwait\nhttps://t.co/132yLwxbph",
"possibly_sensitive": false,
"entities": {
"hashtags": [
{
"start": 2,
"end": 6,
"tag": "TAO"
},
{
"start": 224,
"end": 230,
"tag": "qatar"
},
{
"start": 231,
"end": 235,
"tag": "UAE"
},
{
"start": 236,
"end": 242,
"tag": "Dubai"
},
{
"start": 243,
"end": 250,
"tag": "Kuwait"
}
],
"urls": [
{
"start": 251,
"end": 274,
"url": "https://t.co/132yLwxbph",
"expanded_url": "https://t.me/+phaeKOjznO42ZmE8",
"display_url": "t.me/+phaeKOjznO42Z…",
"images": [
{
"url": "https://pbs.twimg.com/news_img/1912650169780150274/hEh0KHn2?format=png&name=orig",
"width": 256,
"height": 256
},
{
"url": "https://pbs.twimg.com/news_img/1912650169780150274/hEh0KHn2?format=png&name=150x150",
"width": 150,
"height": 150
}
],
"status": 200,
"title": "Join group chat on Telegram",
"description": "You are invited to a group chat on Telegram. Click to",
"unwound_url": "https://t.me/+phaeKOjznO42ZmE8"
}
],
"annotations": [
{
"start": 232,
"end": 234,
"probability": 0.905,
"type": "Place",
"normalized_text": "UAE"
},
{
"start": 237,
"end": 241,
"probability": 0.9325,
"type": "Place",
"normalized_text": "Dubai"
},
{
"start": 244,
"end": 249,
"probability": 0.8952,
"type": "Place",
"normalized_text": "Kuwait"
}
]
},
"reply_settings": "everyone",
"created_at": "2025-04-23T02:31:02.000Z",
"edit_history_tweet_ids": [
"1914869594365231388"
],
"lang": "en",
"public_metrics": {
"retweet_count": 0,
"reply_count": 0,
"like_count": 0,
"quote_count": 0,
"bookmark_count": 0,
"impression_count": 26
},
"id": "1914869594365231388",
"conversation_id": "1914869594365231388",
"context_annotations": [
{
"domain": {
"id": "46",
"name": "Business Taxonomy",
"description": "Categories within Brand Verticals that narrow down the scope of Brands"
},
"entity": {
"id": "1557696848252391426",
"name": "Financial Services Business",
"description": "Brands, companies, advertisers and every non-person handle with the profit intent related to Banks, Credit cards, Insurance, Investments, Stocks "
}
},
{
"domain": {
"id": "30",
"name": "Entities [Entity Service]",
"description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
},
"entity": {
"id": "1139229087682068480",
"name": "Tether cryptocurrency"
}
},
{
"domain": {
"id": "131",
"name": "Unified Twitter Taxonomy",
"description": "A taxonomy of user interests. "
},
"entity": {
"id": "913142676819648512",
"name": "Cryptocurrencies",
"description": "Cryptocurrency"
}
},
{
"domain": {
"id": "131",
"name": "Unified Twitter Taxonomy",
"description": "A taxonomy of user interests. "
},
"entity": {
"id": "1139229087682068480",
"name": "Tether cryptocurrency"
}
},
{
"domain": {
"id": "131",
"name": "Unified Twitter Taxonomy",
"description": "A taxonomy of user interests. "
},
"entity": {
"id": "1484181943616884743",
"name": "Cryptocoins"
}
},
{
"domain": {
"id": "131",
"name": "Unified Twitter Taxonomy",
"description": "A taxonomy of user interests. "
},
"entity": {
"id": "1492162686204854274",
"name": "Digital assets & cryptocurrency",
"description": "Cryptocurrency"
}
}
]
}