代码

import json
import time
from datetime import datetime
from zoneinfo import ZoneInfo
from scrapy import Spider, Request
from urllib.parse import urlencode
from config import config
from .config import list_ids
from utils.spider_failed_alert import ErrorMonitor

"""
Collect the tweets of every configured list through the official API; comments are not included.
"""

class TwitterListOfficial(Spider):
    """Spider that collects the tweets of every configured list.

    One request per entry of ``list_ids`` is sent to the official v2
    ``/2/lists/{id}/tweets`` endpoint. Every tweet in the response is
    reshaped into a flat dict that also carries its author's profile.
    Comments are not collected: ``comments`` is always an empty list.
    """

    author = 'drake.shi'
    name = 'twitter_list_official'
    MONGO_COL = 'twitter_list'
    change_ua = False
    # Do not go through a proxy.
    proxy = False
    # Scheduled every 20 minutes (in theory a higher frequency would
    # capture more data).
    # NOTE(review): the unit of this value is defined by the scheduler,
    # which is not visible here -- confirm that 12 * 20 means 20 minutes.
    schedule_time = 12 * 20
    api = "https://api.twitter.com/2/lists/{}/tweets"
    custom_settings = {
        'DNSCACHE_ENABLED': False,
        'REACTOR_THREADPOOL_MAXSIZE': 1,
        'DOWNLOAD_DELAY': 5
    }

    def start_requests(self):
        """Yield one authenticated request per configured list id.

        Only the first page of each list (``max_results`` = 100) is
        requested; pagination is not followed.
        """
        # Headers and query string are identical for every list, so they
        # are built once instead of once per iteration.
        headers = {
            "Authorization": f"Bearer {config.X_BEARER_TOKEN}",
            # Key setting: do not reuse keep-alive connections.
            "Connection": "close",
        }
        query = urlencode({
            "max_results": 100,
            "tweet.fields": "attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld",
            "expansions": "author_id,attachments.media_keys,attachments.poll_ids,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id",
            "user.fields": "created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld",
            "media.fields": "duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text",
            "poll.fields": "duration_minutes,end_datetime,id,options,voting_status",
            "place.fields": "contained_within,country,country_code,full_name,geo,id,name,place_type"
        })

        for list_id in list_ids:
            yield Request(
                url=f"{self.api.format(list_id)}?{query}",
                headers=headers
            )

    @ErrorMonitor(name, author)
    def parse(self, response, **kwargs):
        """Turn one list-tweets response into flat tweet items.

        Each request returns roughly 100 tweets.

        Yields:
            dict: one item per tweet, keyed by the tweet id (``_id``), with
            the author's profile nested under ``user_info``.

        Raises:
            KeyError: if the payload has no ``data`` key and does not
                report an empty result, so unexpected payloads still
                surface as failures.
        """
        payload = json.loads(response.text)

        # Tweets. The API omits "data" entirely when the page is empty, so
        # an empty list must end quietly instead of raising.
        tweets = payload.get("data")
        if tweets is None:
            if payload.get("meta", {}).get("result_count") == 0:
                return
            raise KeyError("data")

        # Author profiles delivered through the "author_id" expansion.
        users = payload.get("includes", {}).get("users", [])
        users_by_id = {u["id"]: u for u in users}

        # Rebuild the data structure.
        for tweet in tweets:
            author_id = tweet["author_id"]
            metrics = tweet['public_metrics']
            item = {
                '_id': tweet['id'],
                # Content
                'content': tweet['text'],
                # Creation time (UTC+8)
                'created_at': self._to_shanghai_time(tweet['created_at']),
                # Comments (not collected by this spider)
                'comments': [],
                # Like count
                'favorite_count': metrics['like_count'],
                # Reply count
                'reply_count': metrics['reply_count'],
                # Retweet count
                'retweet_count': metrics['retweet_count'],
            }
            item['user_info'] = self._build_user_info(
                users_by_id.get(author_id, {}), author_id
            )
            yield item

    @staticmethod
    def _to_shanghai_time(created_at):
        """Convert an ISO-8601 UTC timestamp to 'YYYY-mm-dd HH:MM:SS' in UTC+8."""
        # "Z" -> "+00:00" so that datetime.fromisoformat() accepts it.
        utc_moment = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
        local_moment = utc_moment.astimezone(ZoneInfo("Asia/Shanghai"))
        return local_moment.strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def _build_user_info(user, author_id):
        """Build the ``user_info`` sub-document for one tweet author.

        ``user`` may be an empty dict when the author is absent from the
        response's expanded users. In that case the id falls back to
        ``author_id``, text fields become ``''`` and unknown counters
        become ``None`` rather than a misleading ``0``.
        """
        stats = user.get('public_metrics', {})
        user_id = user.get('id', author_id)
        # e.g. https://pbs.twimg.com/profile_images/1675587952015974400/jvaLP8ty_normal.jpg
        # The API returns the compressed avatar by default; dropping
        # "_normal" yields the original (large) image.
        icon = user.get('profile_image_url', '').replace('_normal', '')
        return {
            # Display name
            'name': user.get('name', ''),
            # Handle
            'user_name': user.get('username', ''),
            # User id
            'user_id': user_id,
            'rest_id': user_id,
            # Wallet addresses (always empty here)
            'wallet_address': [],
            # Followers
            'followers': stats.get('followers_count'),
            # Following
            'following': stats.get('following_count'),
            # Likes given
            'favourites_count': stats.get('like_count'),
            # Media count
            'media_count': stats.get('media_count'),
            'listed_count': stats.get('listed_count'),
            # Tweet count
            'statuses_count': stats.get('tweet_count'),
            # Avatar
            'icon': icon
        }

响应体解析后 `data` 数组中的单条推文样例(API 返回的原始 tweet 对象,而非 parse 产出的 item)

{
    "author_id": "1482629674035466240",
    "text": "⚡️#TAO/USDT⚡️\n\n🟢LONG/BUY: 311.80 - 305.00\n\n🏹Targets:  317.00  -  325.00 - 335.00 - 350.00 - 380.00+🚀\n\n❌Stop Loss 298.00\n\n‼️Leverage: 20X 10X (Use Leverage according to your risk management)\n\n👉Use only upto 5% of Total Funds\n#qatar #UAE #Dubai #Kuwait\nhttps://t.co/132yLwxbph",
    "possibly_sensitive": false,
    "entities": {
        "hashtags": [
            {
                "start": 2,
                "end": 6,
                "tag": "TAO"
            },
            {
                "start": 224,
                "end": 230,
                "tag": "qatar"
            },
            {
                "start": 231,
                "end": 235,
                "tag": "UAE"
            },
            {
                "start": 236,
                "end": 242,
                "tag": "Dubai"
            },
            {
                "start": 243,
                "end": 250,
                "tag": "Kuwait"
            }
        ],
        "urls": [
            {
                "start": 251,
                "end": 274,
                "url": "https://t.co/132yLwxbph",
                "expanded_url": "https://t.me/+phaeKOjznO42ZmE8",
                "display_url": "t.me/+phaeKOjznO42Z…",
                "images": [
                    {
                        "url": "https://pbs.twimg.com/news_img/1912650169780150274/hEh0KHn2?format=png&name=orig",
                        "width": 256,
                        "height": 256
                    },
                    {
                        "url": "https://pbs.twimg.com/news_img/1912650169780150274/hEh0KHn2?format=png&name=150x150",
                        "width": 150,
                        "height": 150
                    }
                ],
                "status": 200,
                "title": "Join group chat on Telegram",
                "description": "You are invited to a group chat on Telegram. Click to",
                "unwound_url": "https://t.me/+phaeKOjznO42ZmE8"
            }
        ],
        "annotations": [
            {
                "start": 232,
                "end": 234,
                "probability": 0.905,
                "type": "Place",
                "normalized_text": "UAE"
            },
            {
                "start": 237,
                "end": 241,
                "probability": 0.9325,
                "type": "Place",
                "normalized_text": "Dubai"
            },
            {
                "start": 244,
                "end": 249,
                "probability": 0.8952,
                "type": "Place",
                "normalized_text": "Kuwait"
            }
        ]
    },
    "reply_settings": "everyone",
    "created_at": "2025-04-23T02:31:02.000Z",
    "edit_history_tweet_ids": [
        "1914869594365231388"
    ],
    "lang": "en",
    "public_metrics": {
        "retweet_count": 0,
        "reply_count": 0,
        "like_count": 0,
        "quote_count": 0,
        "bookmark_count": 0,
        "impression_count": 26
    },
    "id": "1914869594365231388",
    "conversation_id": "1914869594365231388",
    "context_annotations": [
        {
            "domain": {
                "id": "46",
                "name": "Business Taxonomy",
                "description": "Categories within Brand Verticals that narrow down the scope of Brands"
            },
            "entity": {
                "id": "1557696848252391426",
                "name": "Financial Services Business",
                "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to Banks, Credit cards, Insurance, Investments, Stocks "
            }
        },
        {
            "domain": {
                "id": "30",
                "name": "Entities [Entity Service]",
                "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
            },
            "entity": {
                "id": "1139229087682068480",
                "name": "Tether cryptocurrency"
            }
        },
        {
            "domain": {
                "id": "131",
                "name": "Unified Twitter Taxonomy",
                "description": "A taxonomy of user interests. "
            },
            "entity": {
                "id": "913142676819648512",
                "name": "Cryptocurrencies",
                "description": "Cryptocurrency"
            }
        },
        {
            "domain": {
                "id": "131",
                "name": "Unified Twitter Taxonomy",
                "description": "A taxonomy of user interests. "
            },
            "entity": {
                "id": "1139229087682068480",
                "name": "Tether cryptocurrency"
            }
        },
        {
            "domain": {
                "id": "131",
                "name": "Unified Twitter Taxonomy",
                "description": "A taxonomy of user interests. "
            },
            "entity": {
                "id": "1484181943616884743",
                "name": "Cryptocoins"
            }
        },
        {
            "domain": {
                "id": "131",
                "name": "Unified Twitter Taxonomy",
                "description": "A taxonomy of user interests. "
            },
            "entity": {
                "id": "1492162686204854274",
                "name": "Digital assets & cryptocurrency",
                "description": "Cryptocurrency"
            }
        }
    ]
}