Add Twitter/X syndication API support for image extraction

Twitter/X blocks regular scrapers and doesn't provide og:image meta tags.
This uses the cdn.syndication.twimg.com API to fetch tweet data including
photos, video posters, and card thumbnails.

Note: The API may return 404 for protected/deleted tweets or accounts
with restricted access; in those cases we fall back to the regular metadata fetch.
This commit is contained in:
soup 2026-01-16 23:31:38 -05:00
parent a0699931b5
commit 0f9ee07092
Signed by: soup
SSH key fingerprint: SHA256:GYxje8eQkJ6HZKzVWDdyOUF1TyDiprruGhE0Ym8qYDY

View file

@ -11,6 +11,7 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"path" "path"
"regexp"
"strings" "strings"
"time" "time"
@ -21,6 +22,8 @@ import (
"git.soup.land/soup/lookbook/internal/data/item" "git.soup.land/soup/lookbook/internal/data/item"
) )
// tweetURLPattern matches tweet permalinks on twitter.com and x.com, with an
// optional "www." or "mobile." host prefix. Capture group 1 is the account
// name from the URL path and group 2 is the numeric tweet ID. The pattern is
// anchored only at the start, so trailing path segments or query strings are
// accepted.
var tweetURLPattern = regexp.MustCompile(`^https?://(?:www\.|mobile\.)?(?:twitter\.com|x\.com)/([^/]+)/status/(\d+)`)
const thumbWidth = 480 const thumbWidth = 480
func CreateItemFromURL(ctx context.Context, db *sql.DB, sourceURL string) (item.Row, error) { func CreateItemFromURL(ctx context.Context, db *sql.DB, sourceURL string) (item.Row, error) {
@ -66,6 +69,11 @@ type Metadata struct {
} }
func FetchMetadata(ctx context.Context, sourceURL string) (Metadata, error) { func FetchMetadata(ctx context.Context, sourceURL string) (Metadata, error) {
// Check if this is a Twitter/X URL and use syndication API
if meta, ok := fetchTwitterMetadata(ctx, sourceURL); ok {
return meta, nil
}
resp, err := fetchURL(ctx, sourceURL) resp, err := fetchURL(ctx, sourceURL)
if err != nil { if err != nil {
return Metadata{}, err return Metadata{}, err
@ -129,6 +137,120 @@ type oEmbedResponse struct {
ProviderName string `json:"provider_name"` ProviderName string `json:"provider_name"`
} }
// JSON shapes decoded from the Twitter/X syndication API response. Only a
// subset of the payload is declared; unknown keys are ignored by the decoder.
type (
	// twitterSyndicationResponse is the top-level tweet object.
	twitterSyndicationResponse struct {
		Text   string              `json:"text"`
		User   twitterUser         `json:"user"`
		Photos []twitterPhoto      `json:"photos"`
		Video  *twitterVideo       `json:"video"`
		Card   *twitterCard        `json:"card"`
		Media  []twitterMediaEntry `json:"mediaDetails"`
	}

	// twitterUser is the "user" object of a tweet.
	twitterUser struct {
		Name       string `json:"name"`
		ScreenName string `json:"screen_name"`
	}

	// twitterPhoto is one element of the "photos" array.
	twitterPhoto struct {
		URL string `json:"url"`
	}

	// twitterVideo is the optional "video" object; nil when the key is absent.
	twitterVideo struct {
		Poster string `json:"poster"`
	}

	// twitterCard is the optional "card" object; nil when the key is absent.
	twitterCard struct {
		ThumbnailImageOriginal string `json:"thumbnail_image_original"`
	}

	// twitterMediaEntry is one element of the "mediaDetails" array.
	twitterMediaEntry struct {
		MediaURLHTTPS string `json:"media_url_https"`
		Type          string `json:"type"`
	}
)
// fetchTwitterMetadata looks a tweet up through Twitter's syndication API
// (cdn.syndication.twimg.com) and converts the response into Metadata.
//
// It returns (Metadata{}, false) when sourceURL is not a tweet URL, when the
// request cannot be built or fails, when the API answers with a non-200
// status or a non-JSON content type, when the body does not decode, or when
// the decoded tweet carries no text, author name, or image. A false result
// lets the caller continue with its regular fetch path.
func fetchTwitterMetadata(ctx context.Context, sourceURL string) (Metadata, bool) {
	matches := tweetURLPattern.FindStringSubmatch(sourceURL)
	if matches == nil {
		return Metadata{}, false
	}
	username, tweetID := matches[1], matches[2]

	syndicationURL := fmt.Sprintf("https://cdn.syndication.twimg.com/tweet-result?id=%s&token=0", tweetID)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, syndicationURL, nil)
	if err != nil {
		return Metadata{}, false
	}

	// The syndication API requires specific headers
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
	req.Header.Set("Referer", "https://platform.twitter.com/")

	client := &http.Client{Timeout: 12 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return Metadata{}, false
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return Metadata{}, false
	}

	// Check content type - if it's HTML, it's an error page
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "application/json") {
		return Metadata{}, false
	}

	var tweet twitterSyndicationResponse
	if err := json.NewDecoder(resp.Body).Decode(&tweet); err != nil {
		return Metadata{}, false
	}

	imageURL := tweetImageURL(tweet)

	// A 200 response whose JSON has none of the fields we read (for example an
	// empty object) gives us nothing to show; report failure so the caller
	// falls back instead of storing blank metadata.
	if tweet.Text == "" && tweet.User.Name == "" && imageURL == "" {
		return Metadata{}, false
	}

	// Default to the handle from the URL; upgrade to "Name (@handle)" when the
	// API supplied a display name. If the API omitted the screen name, reuse
	// the URL handle rather than rendering an empty "(@)".
	description := fmt.Sprintf("@%s", username)
	if tweet.User.Name != "" {
		screenName := tweet.User.ScreenName
		if screenName == "" {
			screenName = username
		}
		description = fmt.Sprintf("%s (@%s)", tweet.User.Name, screenName)
	}

	return Metadata{
		Title:       truncateText(tweet.Text, 200),
		Description: description,
		SiteName:    "X",
		ImageURL:    imageURL,
	}, true
}

// tweetImageURL returns the first non-empty image URL in tweet, checked in
// priority order: photos > video poster > card thumbnail > mediaDetails.
// It returns "" when no source has a URL.
func tweetImageURL(tweet twitterSyndicationResponse) string {
	for _, photo := range tweet.Photos {
		if photo.URL != "" {
			return photo.URL
		}
	}
	if tweet.Video != nil && tweet.Video.Poster != "" {
		return tweet.Video.Poster
	}
	if tweet.Card != nil && tweet.Card.ThumbnailImageOriginal != "" {
		return tweet.Card.ThumbnailImageOriginal
	}
	for _, media := range tweet.Media {
		if media.MediaURLHTTPS != "" {
			return media.MediaURLHTTPS
		}
	}
	return ""
}
// truncateText shortens s to at most maxLen bytes of original text and
// appends "…" when anything was removed. Strings that already fit are
// returned unchanged.
//
// The cut is moved back to the nearest rune boundary so a multi-byte UTF-8
// character is never split, and then to the last space if that space lies in
// the second half of the limit, so words are not chopped needlessly.
// A negative maxLen is treated as 0.
func truncateText(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	if maxLen < 0 {
		maxLen = 0
	}

	// Ranging over a string yields the byte offset of each rune start, so this
	// finds the largest rune boundary that does not exceed maxLen.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	truncated := s[:cut]

	// Try to break at a word boundary
	if lastSpace := strings.LastIndex(truncated, " "); lastSpace > maxLen/2 {
		truncated = truncated[:lastSpace]
	}
	return truncated + "…"
}
func fetchOEmbed(ctx context.Context, sourceURL string) (oEmbedResponse, error) { func fetchOEmbed(ctx context.Context, sourceURL string) (oEmbedResponse, error) {
oembedURL := fmt.Sprintf("https://noembed.com/embed?url=%s", url.QueryEscape(sourceURL)) oembedURL := fmt.Sprintf("https://noembed.com/embed?url=%s", url.QueryEscape(sourceURL))
resp, err := fetchURL(ctx, oembedURL) resp, err := fetchURL(ctx, oembedURL)