Add Twitter/X syndication API support for image extraction
Twitter/X blocks regular scrapers and does not provide og:image meta tags. This change uses the cdn.syndication.twimg.com API to fetch tweet data, including photos, video posters, and card thumbnails. Note: the API may return 404 for protected or deleted tweets and for accounts with restricted access; in those cases the syndication lookup is skipped and FetchMetadata falls back to the regular page fetch.
This commit is contained in:
parent
a0699931b5
commit
0f9ee07092
1 changed files with 122 additions and 0 deletions
|
|
@ -11,6 +11,7 @@ import (
|
|||
"net/http"
|
||||
"net/url"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
|
|
@ -21,6 +22,8 @@ import (
|
|||
"git.soup.land/soup/lookbook/internal/data/item"
|
||||
)
|
||||
|
||||
// tweetURLPattern matches tweet permalinks on twitter.com and x.com, with an
// optional "www." or "mobile." host prefix. Capture group 1 is the account
// handle from the URL path and capture group 2 is the numeric tweet ID.
var tweetURLPattern = regexp.MustCompile(`^https?://(?:(?:www|mobile)\.)?(?:twitter\.com|x\.com)/([^/]+)/status/(\d+)`)
|
||||
|
||||
// thumbWidth is fixed at 480.
// NOTE(review): presumably the pixel width for generated thumbnails; its uses
// are outside this excerpt — confirm.
const thumbWidth = 480
|
||||
|
||||
func CreateItemFromURL(ctx context.Context, db *sql.DB, sourceURL string) (item.Row, error) {
|
||||
|
|
@ -66,6 +69,11 @@ type Metadata struct {
|
|||
}
|
||||
|
||||
func FetchMetadata(ctx context.Context, sourceURL string) (Metadata, error) {
|
||||
// Check if this is a Twitter/X URL and use syndication API
|
||||
if meta, ok := fetchTwitterMetadata(ctx, sourceURL); ok {
|
||||
return meta, nil
|
||||
}
|
||||
|
||||
resp, err := fetchURL(ctx, sourceURL)
|
||||
if err != nil {
|
||||
return Metadata{}, err
|
||||
|
|
@ -129,6 +137,120 @@ type oEmbedResponse struct {
|
|||
ProviderName string `json:"provider_name"`
|
||||
}
|
||||
|
||||
// Response shapes for the Twitter/X syndication API. Each field is decoded
// from the JSON key named in its struct tag.
type (
	// twitterSyndicationResponse is the top-level tweet payload. Video and
	// Card are pointers, so a missing object decodes to nil.
	twitterSyndicationResponse struct {
		Text   string              `json:"text"`
		User   twitterUser         `json:"user"`
		Photos []twitterPhoto      `json:"photos"`
		Video  *twitterVideo       `json:"video"`
		Card   *twitterCard        `json:"card"`
		Media  []twitterMediaEntry `json:"mediaDetails"`
	}

	// twitterUser carries the author's display name and handle.
	twitterUser struct {
		Name       string `json:"name"`
		ScreenName string `json:"screen_name"`
	}

	// twitterPhoto is one entry of the "photos" array.
	twitterPhoto struct {
		URL string `json:"url"`
	}

	// twitterVideo holds the poster image URL of an attached video.
	twitterVideo struct {
		Poster string `json:"poster"`
	}

	// twitterCard holds the thumbnail URL of a link card.
	twitterCard struct {
		ThumbnailImageOriginal string `json:"thumbnail_image_original"`
	}

	// twitterMediaEntry is one entry of the "mediaDetails" array.
	twitterMediaEntry struct {
		MediaURLHTTPS string `json:"media_url_https"`
		Type          string `json:"type"`
	}
)
|
||||
|
||||
// fetchTwitterMetadata attempts to fetch metadata from Twitter's syndication API
|
||||
// Returns the metadata and true if successful, or empty metadata and false if not a Twitter URL or fetch failed
|
||||
func fetchTwitterMetadata(ctx context.Context, sourceURL string) (Metadata, bool) {
|
||||
matches := tweetURLPattern.FindStringSubmatch(sourceURL)
|
||||
if matches == nil {
|
||||
return Metadata{}, false
|
||||
}
|
||||
|
||||
username := matches[1]
|
||||
tweetID := matches[2]
|
||||
|
||||
syndicationURL := fmt.Sprintf("https://cdn.syndication.twimg.com/tweet-result?id=%s&token=0", tweetID)
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, syndicationURL, nil)
|
||||
if err != nil {
|
||||
return Metadata{}, false
|
||||
}
|
||||
|
||||
// The syndication API requires specific headers
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
||||
req.Header.Set("Referer", "https://platform.twitter.com/")
|
||||
|
||||
client := &http.Client{Timeout: 12 * time.Second}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return Metadata{}, false
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return Metadata{}, false
|
||||
}
|
||||
|
||||
// Check content type - if it's HTML, it's an error page
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
if !strings.Contains(contentType, "application/json") {
|
||||
return Metadata{}, false
|
||||
}
|
||||
|
||||
var tweet twitterSyndicationResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&tweet); err != nil {
|
||||
return Metadata{}, false
|
||||
}
|
||||
|
||||
meta := Metadata{
|
||||
Title: truncateText(tweet.Text, 200),
|
||||
Description: fmt.Sprintf("@%s", username),
|
||||
SiteName: "X",
|
||||
}
|
||||
|
||||
// Try to get image URL from various sources
|
||||
// Priority: photos > video poster > card thumbnail > mediaDetails
|
||||
if len(tweet.Photos) > 0 {
|
||||
meta.ImageURL = tweet.Photos[0].URL
|
||||
} else if tweet.Video != nil && tweet.Video.Poster != "" {
|
||||
meta.ImageURL = tweet.Video.Poster
|
||||
} else if tweet.Card != nil && tweet.Card.ThumbnailImageOriginal != "" {
|
||||
meta.ImageURL = tweet.Card.ThumbnailImageOriginal
|
||||
} else if len(tweet.Media) > 0 {
|
||||
meta.ImageURL = tweet.Media[0].MediaURLHTTPS
|
||||
}
|
||||
|
||||
// If we got user info, use it for a better description
|
||||
if tweet.User.Name != "" {
|
||||
meta.Description = fmt.Sprintf("%s (@%s)", tweet.User.Name, tweet.User.ScreenName)
|
||||
}
|
||||
|
||||
return meta, true
|
||||
}
|
||||
|
||||
// truncateText returns s unchanged when it is at most maxLen bytes long.
// Otherwise it keeps at most maxLen bytes of s, never splitting a multi-byte
// UTF-8 character, and appends an ellipsis ("…"). When the kept prefix
// contains a space past the halfway point of maxLen, the cut is moved back to
// that space so the text ends on a word boundary. A negative maxLen is treated
// as 0.
func truncateText(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}

	// Ranging over a string yields the byte index at which each character
	// starts, so the last index that is <= maxLen is a safe place to cut.
	// Slicing at maxLen directly could land in the middle of a character.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	truncated := s[:cut]

	// Try to break at a word boundary, but only if that keeps at least half
	// of the allowed length.
	if lastSpace := strings.LastIndex(truncated, " "); lastSpace > maxLen/2 {
		truncated = truncated[:lastSpace]
	}
	return truncated + "…"
}
|
||||
|
||||
func fetchOEmbed(ctx context.Context, sourceURL string) (oEmbedResponse, error) {
|
||||
oembedURL := fmt.Sprintf("https://noembed.com/embed?url=%s", url.QueryEscape(sourceURL))
|
||||
resp, err := fetchURL(ctx, oembedURL)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue