Add Twitter/X syndication API support for image extraction
Twitter/X blocks regular scrapers and does not provide og:image meta tags. This commit uses the cdn.syndication.twimg.com API to fetch tweet data, including photos, video posters, and card thumbnails. Note: the API may return 404 for protected or deleted tweets, or for accounts with restricted access; in those cases we fall back to the regular metadata fetch.
This commit is contained in:
parent
a0699931b5
commit
0f9ee07092
1 changed files with 122 additions and 0 deletions
|
|
@ -11,6 +11,7 @@ import (
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"path"
|
"path"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
|
@ -21,6 +22,8 @@ import (
|
||||||
"git.soup.land/soup/lookbook/internal/data/item"
|
"git.soup.land/soup/lookbook/internal/data/item"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// tweetURLPattern matches Twitter/X status URLs on twitter.com or x.com, with
// an optional "www." or "mobile." host prefix. Capture group 1 is the account
// name taken from the URL path and group 2 is the numeric tweet ID. Anything
// after the ID (query string, trailing path) is ignored.
var tweetURLPattern = regexp.MustCompile(`^https?://(?:(?:www|mobile)\.)?(?:twitter\.com|x\.com)/([^/]+)/status/(\d+)`)
|
||||||
|
|
||||||
const thumbWidth = 480
|
const thumbWidth = 480
|
||||||
|
|
||||||
func CreateItemFromURL(ctx context.Context, db *sql.DB, sourceURL string) (item.Row, error) {
|
func CreateItemFromURL(ctx context.Context, db *sql.DB, sourceURL string) (item.Row, error) {
|
||||||
|
|
@ -66,6 +69,11 @@ type Metadata struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
func FetchMetadata(ctx context.Context, sourceURL string) (Metadata, error) {
|
func FetchMetadata(ctx context.Context, sourceURL string) (Metadata, error) {
|
||||||
|
// Check if this is a Twitter/X URL and use syndication API
|
||||||
|
if meta, ok := fetchTwitterMetadata(ctx, sourceURL); ok {
|
||||||
|
return meta, nil
|
||||||
|
}
|
||||||
|
|
||||||
resp, err := fetchURL(ctx, sourceURL)
|
resp, err := fetchURL(ctx, sourceURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return Metadata{}, err
|
return Metadata{}, err
|
||||||
|
|
@ -129,6 +137,120 @@ type oEmbedResponse struct {
|
||||||
ProviderName string `json:"provider_name"`
|
ProviderName string `json:"provider_name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Decoding targets for the Twitter/X syndication API's JSON response. Only the
// keys named in the struct tags are decoded; everything else in the payload is
// dropped by encoding/json.
type (
	// twitterUser is the "user" object of a syndication response.
	twitterUser struct {
		Name       string `json:"name"`
		ScreenName string `json:"screen_name"`
	}

	// twitterPhoto is one element of the "photos" array.
	twitterPhoto struct {
		URL string `json:"url"`
	}

	// twitterVideo is the optional "video" object; only its poster is decoded.
	twitterVideo struct {
		Poster string `json:"poster"`
	}

	// twitterCard is the optional "card" object; only its original-size
	// thumbnail is decoded.
	twitterCard struct {
		ThumbnailImageOriginal string `json:"thumbnail_image_original"`
	}

	// twitterMediaEntry is one element of the "mediaDetails" array.
	twitterMediaEntry struct {
		MediaURLHTTPS string `json:"media_url_https"`
		Type          string `json:"type"`
	}

	// twitterSyndicationResponse is the top-level syndication payload. Video
	// and Card are pointers so that an absent or null object decodes to nil.
	twitterSyndicationResponse struct {
		Text   string              `json:"text"`
		User   twitterUser         `json:"user"`
		Photos []twitterPhoto      `json:"photos"`
		Video  *twitterVideo       `json:"video"`
		Card   *twitterCard        `json:"card"`
		Media  []twitterMediaEntry `json:"mediaDetails"`
	}
)
|
||||||
|
|
||||||
|
// fetchTwitterMetadata attempts to fetch metadata from Twitter's syndication API
|
||||||
|
// Returns the metadata and true if successful, or empty metadata and false if not a Twitter URL or fetch failed
|
||||||
|
func fetchTwitterMetadata(ctx context.Context, sourceURL string) (Metadata, bool) {
|
||||||
|
matches := tweetURLPattern.FindStringSubmatch(sourceURL)
|
||||||
|
if matches == nil {
|
||||||
|
return Metadata{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
username := matches[1]
|
||||||
|
tweetID := matches[2]
|
||||||
|
|
||||||
|
syndicationURL := fmt.Sprintf("https://cdn.syndication.twimg.com/tweet-result?id=%s&token=0", tweetID)
|
||||||
|
|
||||||
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, syndicationURL, nil)
|
||||||
|
if err != nil {
|
||||||
|
return Metadata{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// The syndication API requires specific headers
|
||||||
|
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
|
||||||
|
req.Header.Set("Referer", "https://platform.twitter.com/")
|
||||||
|
|
||||||
|
client := &http.Client{Timeout: 12 * time.Second}
|
||||||
|
resp, err := client.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return Metadata{}, false
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
if resp.StatusCode != 200 {
|
||||||
|
return Metadata{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check content type - if it's HTML, it's an error page
|
||||||
|
contentType := resp.Header.Get("Content-Type")
|
||||||
|
if !strings.Contains(contentType, "application/json") {
|
||||||
|
return Metadata{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
var tweet twitterSyndicationResponse
|
||||||
|
if err := json.NewDecoder(resp.Body).Decode(&tweet); err != nil {
|
||||||
|
return Metadata{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
meta := Metadata{
|
||||||
|
Title: truncateText(tweet.Text, 200),
|
||||||
|
Description: fmt.Sprintf("@%s", username),
|
||||||
|
SiteName: "X",
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to get image URL from various sources
|
||||||
|
// Priority: photos > video poster > card thumbnail > mediaDetails
|
||||||
|
if len(tweet.Photos) > 0 {
|
||||||
|
meta.ImageURL = tweet.Photos[0].URL
|
||||||
|
} else if tweet.Video != nil && tweet.Video.Poster != "" {
|
||||||
|
meta.ImageURL = tweet.Video.Poster
|
||||||
|
} else if tweet.Card != nil && tweet.Card.ThumbnailImageOriginal != "" {
|
||||||
|
meta.ImageURL = tweet.Card.ThumbnailImageOriginal
|
||||||
|
} else if len(tweet.Media) > 0 {
|
||||||
|
meta.ImageURL = tweet.Media[0].MediaURLHTTPS
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we got user info, use it for a better description
|
||||||
|
if tweet.User.Name != "" {
|
||||||
|
meta.Description = fmt.Sprintf("%s (@%s)", tweet.User.Name, tweet.User.ScreenName)
|
||||||
|
}
|
||||||
|
|
||||||
|
return meta, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// truncateText returns s unchanged when it is at most maxLen bytes long.
// Otherwise it keeps at most maxLen bytes of s, never cutting inside a
// multi-byte UTF-8 character, and appends "…". If the kept prefix contains a
// space past the halfway point (maxLen/2), the cut moves back to that space so
// the text ends on a word boundary. A negative maxLen is treated as 0.
func truncateText(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}

	// Ranging over a string yields the byte offset of each character start, so
	// the last offset <= maxLen is the largest safe cut point. len(s) > maxLen
	// guarantees s[:cut] is in range.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}

	// Try to break at a word boundary
	truncated := s[:cut]
	if lastSpace := strings.LastIndex(truncated, " "); lastSpace > maxLen/2 {
		truncated = truncated[:lastSpace]
	}
	return truncated + "…"
}
|
||||||
|
|
||||||
func fetchOEmbed(ctx context.Context, sourceURL string) (oEmbedResponse, error) {
|
func fetchOEmbed(ctx context.Context, sourceURL string) (oEmbedResponse, error) {
|
||||||
oembedURL := fmt.Sprintf("https://noembed.com/embed?url=%s", url.QueryEscape(sourceURL))
|
oembedURL := fmt.Sprintf("https://noembed.com/embed?url=%s", url.QueryEscape(sourceURL))
|
||||||
resp, err := fetchURL(ctx, oembedURL)
|
resp, err := fetchURL(ctx, oembedURL)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue