lookbook/internal/services/metadata.go
soup 0f9ee07092
Add Twitter/X syndication API support for image extraction
Twitter/X blocks regular scrapers and doesn't provide og:image meta tags.
This uses the cdn.syndication.twimg.com API to fetch tweet data including
photos, video posters, and card thumbnails.

Note: The API may return 404 for protected/deleted tweets or accounts
with restricted access - in those cases we fall back gracefully.
2026-01-16 23:31:38 -05:00

410 lines
10 KiB
Go

package services
import (
"bytes"
"context"
"database/sql"
"encoding/json"
"fmt"
"io"
"mime"
"net/http"
"net/url"
"path"
"regexp"
"strings"
"time"
"github.com/disintegration/imaging"
"golang.org/x/net/html"
"git.soup.land/soup/lookbook/internal/data/image"
"git.soup.land/soup/lookbook/internal/data/item"
)
// tweetURLPattern matches http(s) tweet permalinks on twitter.com or x.com
// (with an optional "www." prefix) of the form /<segment>/status/<digits>.
// Capture group 1 is the path segment before "/status" (used as the account
// handle) and capture group 2 is the numeric tweet ID. The pattern is anchored
// at the start only, so trailing path or query text is allowed.
var tweetURLPattern = regexp.MustCompile(`^https?://(?:www\.)?(?:twitter\.com|x\.com)/([^/]+)/status/(\d+)`)
// thumbWidth is the width, in pixels, that createThumb resizes wider images
// down to. Images whose width is already at or below this value are not
// resized.
const thumbWidth = 480
// CreateItemFromURL fetches metadata for sourceURL, inserts a new item row
// carrying the fetched title, description and site name, and then stores the
// item's image (if the metadata has one) through storeImages.
//
// When fetching the metadata or inserting the row fails, a zero item.Row is
// returned with the error. When only the image step fails, the row that was
// already created is returned together with that error.
func CreateItemFromURL(ctx context.Context, db *sql.DB, sourceURL string) (item.Row, error) {
	var none item.Row

	fetched, fetchErr := FetchMetadata(ctx, sourceURL)
	if fetchErr != nil {
		return none, fetchErr
	}

	created, createErr := item.QCreate(ctx, db, sourceURL, fetched.Title, fetched.Description, fetched.SiteName)
	if createErr != nil {
		return none, createErr
	}

	// The row exists at this point, so it is handed back even if storing the
	// image reports an error.
	imageErr := storeImages(ctx, db, created.ID, fetched)
	return created, imageErr
}
// RefreshItemFromURL re-fetches metadata for row.SourceURL, writes the new
// title, description and site name onto the existing item, removes the item's
// stored images and stores the freshly referenced image again.
//
// The first error encountered is returned as-is and aborts the remaining
// steps. NOTE(review): the old images are deleted before storeImages downloads
// the replacement, so a failed download appears to leave the item without
// images; confirm that this is acceptable.
func RefreshItemFromURL(ctx context.Context, db *sql.DB, row item.Row) error {
	fresh, err := FetchMetadata(ctx, row.SourceURL)
	if err != nil {
		return err
	}

	itemID := row.ID

	err = item.QUpdateMeta(ctx, db, itemID, fresh.Title, fresh.Description, fresh.SiteName)
	if err != nil {
		return err
	}

	err = image.QDeleteByItem(ctx, db, itemID)
	if err != nil {
		return err
	}

	return storeImages(ctx, db, itemID, fresh)
}
// Metadata is the descriptive information FetchMetadata gathers for a source
// URL. Any field may be empty when the corresponding information was not
// found.
type Metadata struct {
// Title is the page title, tweet text or, for direct image links, the last
// path element of the URL.
Title string
// Description is the page description or, for tweets, the author's handle.
Description string
// SiteName is the site or provider name.
SiteName string
// ImageURL is the URL of the image that storeImages downloads for the item.
ImageURL string
}
// FetchMetadata collects title, description, site name and image URL for
// sourceURL.
//
// Sources are tried in this order:
//  1. Tweet permalinks are answered by fetchTwitterMetadata; when that
//     succeeds nothing else is fetched.
//  2. The URL is downloaded. If the response is itself an image, the metadata
//     is derived from the final (post-redirect) URL alone.
//  3. Otherwise the body (capped at 8 MiB) is parsed as HTML and its <meta>
//     tags and <title> are used.
//  4. If the page yields no image, the oEmbed endpoint is consulted to fill in
//     whatever is still missing; oEmbed failures are ignored.
//
// An image URL found in the page is resolved against the final URL, so
// relative and protocol-relative references become absolute and can be
// downloaded later.
//
// An error is returned only when the page cannot be fetched or its body cannot
// be read. Unparseable HTML yields empty metadata and a nil error.
func FetchMetadata(ctx context.Context, sourceURL string) (Metadata, error) {
	// Check if this is a Twitter/X URL and use the syndication API.
	if meta, ok := fetchTwitterMetadata(ctx, sourceURL); ok {
		return meta, nil
	}

	resp, err := fetchURL(ctx, sourceURL)
	if err != nil {
		return Metadata{}, err
	}
	defer resp.Body.Close()

	// resp.Request.URL is the URL that was finally served, after redirects.
	finalURL := resp.Request.URL
	contentType := resp.Header.Get("Content-Type")

	// Direct image link: everything we need comes from the URL, so the body
	// is not read here (storeImages downloads the image itself).
	if imageURL := extractImageURL(finalURL, contentType); imageURL != "" {
		return Metadata{
			Title:    path.Base(finalURL.Path),
			SiteName: finalURL.Hostname(),
			ImageURL: imageURL,
		}, nil
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, 8<<20))
	if err != nil {
		return Metadata{}, err
	}

	var meta Metadata
	doc, err := html.Parse(bytes.NewReader(body))
	if err != nil {
		// Best effort: a page we cannot parse still produces an item.
		return meta, nil
	}
	extractMeta(doc, &meta)
	if meta.Title == "" {
		meta.Title = titleFromDoc(doc)
	}

	if meta.ImageURL == "" {
		if oembed, err := fetchOEmbed(ctx, sourceURL); err == nil {
			if meta.Title == "" {
				meta.Title = oembed.Title
			}
			if meta.Description == "" {
				meta.Description = oembed.Description
			}
			meta.ImageURL = oembed.ThumbnailURL
			if meta.SiteName == "" {
				meta.SiteName = oembed.ProviderName
			}
		}
	}

	// Pages may reference their image relatively ("/img/a.png") or without a
	// scheme ("//cdn.example.com/a.png"). Resolve those against the page URL;
	// absolute URLs are left untouched.
	if meta.ImageURL != "" {
		if ref, err := url.Parse(meta.ImageURL); err == nil && !ref.IsAbs() {
			meta.ImageURL = finalURL.ResolveReference(ref).String()
		}
	}

	return meta, nil
}
// oEmbedResponse holds the fields decoded from the JSON document returned by
// the oEmbed endpoint that fetchOEmbed queries. Keys absent from the response
// leave the corresponding field empty.
type oEmbedResponse struct {
Title string `json:"title"`
Description string `json:"description"`
ThumbnailURL string `json:"thumbnail_url"`
ProviderName string `json:"provider_name"`
}
// Twitter/X syndication API response structures

// twitterSyndicationResponse is the part of the syndication API's JSON reply
// that fetchTwitterMetadata decodes. Video and Card are pointers so that a
// missing object can be told apart from an empty one.
type twitterSyndicationResponse struct {
// Text is used (truncated) as the item title.
Text string `json:"text"`
User twitterUser `json:"user"`
Photos []twitterPhoto `json:"photos"`
Video *twitterVideo `json:"video"`
Card *twitterCard `json:"card"`
// Media is decoded from the "mediaDetails" key.
Media []twitterMediaEntry `json:"mediaDetails"`
}
// twitterUser is the "user" object of a syndication response. Both fields are
// combined into the item description as "Name (@ScreenName)".
type twitterUser struct {
Name string `json:"name"`
ScreenName string `json:"screen_name"`
}
// twitterPhoto is one entry of the "photos" array of a syndication response.
type twitterPhoto struct {
// URL is used as the item's image URL when it belongs to the first photo.
URL string `json:"url"`
}
// twitterVideo is the "video" object of a syndication response.
type twitterVideo struct {
// Poster is used as the item's image URL when the tweet has no photos.
Poster string `json:"poster"`
}
// twitterCard is the "card" object of a syndication response.
type twitterCard struct {
// ThumbnailImageOriginal is used as the item's image URL when the tweet has
// neither photos nor a video poster.
// NOTE(review): decoded as a plain string; confirm the API does not nest
// this value inside an object.
ThumbnailImageOriginal string `json:"thumbnail_image_original"`
}
// twitterMediaEntry is one entry of the "mediaDetails" array of a syndication
// response.
type twitterMediaEntry struct {
// MediaURLHTTPS is the last-resort image URL for a tweet.
MediaURLHTTPS string `json:"media_url_https"`
// Type is decoded but not read by the code in this file.
Type string `json:"type"`
}
// fetchTwitterMetadata attempts to fetch metadata for a tweet from Twitter's
// syndication API.
//
// It returns the metadata and true on success. It returns empty metadata and
// false when sourceURL is not a tweet permalink (see tweetURLPattern), when the
// request fails, when the API answers with a non-200 status or a non-JSON
// body, when the JSON cannot be decoded, or when the decoded object carries
// none of the fields used here. In every false case the caller is expected to
// fall back to generic scraping.
//
// The resulting Metadata uses the (truncated) tweet text as Title, the author
// as Description, "X" as SiteName and the first available image as ImageURL.
func fetchTwitterMetadata(ctx context.Context, sourceURL string) (Metadata, bool) {
	matches := tweetURLPattern.FindStringSubmatch(sourceURL)
	if matches == nil {
		return Metadata{}, false
	}
	username, tweetID := matches[1], matches[2]

	syndicationURL := fmt.Sprintf("https://cdn.syndication.twimg.com/tweet-result?id=%s&token=0", tweetID)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, syndicationURL, nil)
	if err != nil {
		return Metadata{}, false
	}
	// The syndication API requires specific headers
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
	req.Header.Set("Referer", "https://platform.twitter.com/")

	client := &http.Client{Timeout: 12 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return Metadata{}, false
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return Metadata{}, false
	}
	// Check content type - if it's HTML, it's an error page
	contentType := resp.Header.Get("Content-Type")
	if !strings.Contains(contentType, "application/json") {
		return Metadata{}, false
	}

	// Cap the amount of JSON we are willing to decode from a remote server,
	// in the same spirit as the page-body limit in FetchMetadata.
	var tweet twitterSyndicationResponse
	if err := json.NewDecoder(io.LimitReader(resp.Body, 1<<20)).Decode(&tweet); err != nil {
		return Metadata{}, false
	}

	// Collect image candidates in priority order:
	// photos > video poster > card thumbnail > mediaDetails.
	var candidates []string
	if len(tweet.Photos) > 0 {
		candidates = append(candidates, tweet.Photos[0].URL)
	}
	if tweet.Video != nil {
		candidates = append(candidates, tweet.Video.Poster)
	}
	if tweet.Card != nil {
		candidates = append(candidates, tweet.Card.ThumbnailImageOriginal)
	}
	if len(tweet.Media) > 0 {
		candidates = append(candidates, tweet.Media[0].MediaURLHTTPS)
	}
	// Take the first candidate that is actually populated, so an empty
	// higher-priority entry does not hide a usable lower-priority one.
	imageURL := ""
	for _, candidate := range candidates {
		if candidate != "" {
			imageURL = candidate
			break
		}
	}

	// A 200 reply with none of the fields we read (for example "{}") tells us
	// nothing about the tweet; report failure so the caller can fall back.
	if tweet.Text == "" && tweet.User.Name == "" && tweet.User.ScreenName == "" && imageURL == "" {
		return Metadata{}, false
	}

	// Default to the handle from the URL; prefer the API's user info if present.
	description := fmt.Sprintf("@%s", username)
	if tweet.User.Name != "" {
		screenName := tweet.User.ScreenName
		if screenName == "" {
			screenName = username
		}
		description = fmt.Sprintf("%s (@%s)", tweet.User.Name, screenName)
	}

	return Metadata{
		Title:       truncateText(tweet.Text, 200),
		Description: description,
		SiteName:    "X",
		ImageURL:    imageURL,
	}, true
}
// truncateText shortens s so that the kept text is at most maxLen bytes and
// appends an ellipsis ("…") when anything was cut. Strings that already fit
// are returned unchanged.
//
// The cut never lands inside a multi-byte UTF-8 sequence: if maxLen falls in
// the middle of a rune, the cut moves back to the start of that rune. When a
// space occurs in the second half of the kept text, the cut moves back to that
// space so the result ends on a word boundary. A negative maxLen is treated
// as 0.
func truncateText(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}

	// s[cut] exists because cut <= maxLen < len(s). UTF-8 continuation bytes
	// have the bit pattern 10xxxxxx; step back until the cut sits on the
	// first byte of a rune.
	cut := maxLen
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	truncated := s[:cut]

	// Try to break at a word boundary
	if lastSpace := strings.LastIndex(truncated, " "); lastSpace > maxLen/2 {
		truncated = truncated[:lastSpace]
	}
	return truncated + "…"
}
// fetchOEmbed asks the noembed.com oEmbed endpoint about sourceURL and returns
// the decoded response. The source URL is query-escaped before being placed in
// the request URL. A failed request or a body that is not valid JSON produces
// a zero oEmbedResponse and the underlying error.
func fetchOEmbed(ctx context.Context, sourceURL string) (oEmbedResponse, error) {
	endpoint := "https://noembed.com/embed?url=" + url.QueryEscape(sourceURL)

	resp, err := fetchURL(ctx, endpoint)
	if err != nil {
		return oEmbedResponse{}, err
	}
	defer resp.Body.Close()

	var decoded oEmbedResponse
	decodeErr := json.NewDecoder(resp.Body).Decode(&decoded)
	if decodeErr != nil {
		return oEmbedResponse{}, decodeErr
	}
	return decoded, nil
}
// fetchURL performs a GET request for rawURL with the "lookbook/1.0"
// User-Agent and a 12 second client timeout.
//
// On success the caller owns the returned response and must close its body.
// Responses with a status outside 200-399 are turned into an error; their body
// is closed here before returning.
func fetchURL(ctx context.Context, rawURL string) (*http.Response, error) {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("User-Agent", "lookbook/1.0")

	httpClient := &http.Client{Timeout: 12 * time.Second}
	response, err := httpClient.Do(request)
	if err != nil {
		return nil, err
	}

	status := response.StatusCode
	if accepted := status >= 200 && status < 400; accepted {
		return response, nil
	}

	response.Body.Close()
	return nil, fmt.Errorf("fetch %s: status %d", rawURL, status)
}
// extractMeta walks the HTML tree rooted at n in document order and copies
// Open Graph / Twitter card values from <meta> tags into meta.
//
// A tag's key is taken from its "property" attribute and, if that is not one
// of the recognized keys, from its "name" attribute; Twitter card tags are
// conventionally written with name= while Open Graph tags use property=.
// Keys are compared case-insensitively and the content value is trimmed.
// Tags with empty content are ignored. Each field keeps the first value found;
// values already present in meta are never overwritten. A plain
// <meta name="description"> fills Description when nothing else has yet.
func extractMeta(n *html.Node, meta *Metadata) {
	if n.Type == html.ElementNode && n.Data == "meta" {
		var property, content, name string
		for _, attr := range n.Attr {
			switch strings.ToLower(attr.Key) {
			case "property":
				property = strings.ToLower(attr.Val)
			case "content":
				content = strings.TrimSpace(attr.Val)
			case "name":
				name = strings.ToLower(attr.Val)
			}
		}

		if content != "" {
			// assign stores content in the field that key maps to, unless that
			// field is already set, and reports whether key was recognized.
			assign := func(key string) bool {
				var field *string
				switch key {
				case "og:title", "twitter:title":
					field = &meta.Title
				case "og:description", "twitter:description":
					field = &meta.Description
				case "og:site_name":
					field = &meta.SiteName
				case "og:image", "og:image:url", "twitter:image", "twitter:image:src":
					field = &meta.ImageURL
				default:
					return false
				}
				if *field == "" {
					*field = content
				}
				return true
			}

			// property= wins; name= is consulted only when property= did not
			// carry a recognized key.
			if !assign(property) {
				assign(name)
			}

			if meta.Description == "" && name == "description" {
				meta.Description = content
			}
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		extractMeta(c, meta)
	}
}
// titleFromDoc returns the trimmed text of the first <title> element, in
// document order, whose first child holds non-blank data. It returns "" when
// the tree rooted at n has no such element.
//
// A <title> element that has a first child is never descended into, whether or
// not its text turns out to be blank.
func titleFromDoc(n *html.Node) string {
	// Explicit stack for a pre-order walk; the top of the stack is the next
	// node in document order.
	pending := []*html.Node{n}

	for len(pending) > 0 {
		top := len(pending) - 1
		node := pending[top]
		pending = pending[:top]

		isTitle := node.Type == html.ElementNode && node.Data == "title"
		if isTitle && node.FirstChild != nil {
			if text := strings.TrimSpace(node.FirstChild.Data); text != "" {
				return text
			}
			// Blank title: skip its subtree and keep looking elsewhere.
			continue
		}

		// Push the children, then reverse the pushed run so that the first
		// child ends up on top of the stack.
		mark := len(pending)
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			pending = append(pending, child)
		}
		for lo, hi := mark, len(pending)-1; lo < hi; lo, hi = lo+1, hi-1 {
			pending[lo], pending[hi] = pending[hi], pending[lo]
		}
	}

	return ""
}
// extractImageURL returns baseURL in string form when contentType names an
// image type (a case-insensitive "image/" prefix), and "" for every other
// content type. baseURL is not touched unless the content type is an image.
func extractImageURL(baseURL *url.URL, contentType string) string {
	isImage := strings.HasPrefix(strings.ToLower(contentType), "image/")
	if !isImage {
		return ""
	}
	return baseURL.String()
}
// storeImages downloads meta.ImageURL and stores it for itemID as a
// full-size image row plus, when the image can be decoded, a thumbnail row.
//
// It does nothing and returns nil when meta.ImageURL is empty. At most 16 MiB
// of the response body are read. The content type comes from the response
// header, falling back to the file extension of the final URL. Any download,
// read, thumbnail-encoding or database error is returned unchanged.
//
// The thumbnail row describes the bytes it actually holds: a resized thumbnail
// is recorded as thumbWidth wide with the thumbnail content type, whereas an
// image that was small enough to be reused as its own thumbnail keeps its own
// width and content type.
func storeImages(ctx context.Context, db *sql.DB, itemID int64, meta Metadata) error {
	if meta.ImageURL == "" {
		return nil
	}

	resp, err := fetchURL(ctx, meta.ImageURL)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	payload, err := io.ReadAll(io.LimitReader(resp.Body, 16<<20))
	if err != nil {
		return err
	}

	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		contentType = mime.TypeByExtension(strings.ToLower(path.Ext(resp.Request.URL.Path)))
	}

	// createThumb reports no error for undecodable payloads; in that case the
	// dimensions are zero and thumbBytes is nil, so only the original is kept.
	width, height, thumbBytes, thumbHeight, err := createThumb(payload)
	if err != nil {
		return err
	}

	if _, err := image.QCreate(ctx, db, itemID, meta.ImageURL, contentType, payload, width, height, false); err != nil {
		return err
	}
	if thumbBytes == nil {
		return nil
	}

	// createThumb only re-encodes images wider than thumbWidth. For anything
	// at or below that width it hands back the original bytes, which are
	// neither thumbWidth wide nor necessarily in the thumbnail format.
	storedWidth, storedType := thumbWidth, thumbContentType(contentType)
	if width <= thumbWidth {
		storedWidth, storedType = width, contentType
	}

	_, err = image.QCreate(ctx, db, itemID, meta.ImageURL, storedType, thumbBytes, storedWidth, thumbHeight, true)
	return err
}
// createThumb decodes payload and produces thumbnail data for it.
//
// The results are, in order: source width, source height, thumbnail bytes,
// thumbnail height and an error.
//
//   - If payload cannot be decoded, the decode error is deliberately dropped
//     and (0, 0, nil, 0, nil) is returned.
//   - If the image is no wider than thumbWidth, payload itself is returned as
//     the thumbnail bytes, with the source height as the thumbnail height.
//   - Otherwise the image is resized to thumbWidth (height passed as 0) with
//     the Lanczos filter and encoded as JPEG. An encoding failure returns the
//     source dimensions, nil bytes and the error.
func createThumb(payload []byte) (int, int, []byte, int, error) {
	decoded, decodeErr := imaging.Decode(bytes.NewReader(payload))
	if decodeErr != nil {
		return 0, 0, nil, 0, nil
	}

	size := decoded.Bounds()
	srcWidth, srcHeight := size.Dx(), size.Dy()

	if srcWidth > thumbWidth {
		resized := imaging.Resize(decoded, thumbWidth, 0, imaging.Lanczos)

		var encoded bytes.Buffer
		if encodeErr := imaging.Encode(&encoded, resized, imaging.JPEG); encodeErr != nil {
			return srcWidth, srcHeight, nil, 0, encodeErr
		}
		return srcWidth, srcHeight, encoded.Bytes(), resized.Bounds().Dy(), nil
	}

	// Already small enough: the original bytes double as the thumbnail.
	return srcWidth, srcHeight, payload, srcHeight, nil
}
// thumbContentType returns the content type recorded for thumbnails. The
// argument is ignored; the result is always "image/jpeg", the format
// createThumb encodes resized images in.
func thumbContentType(_ string) string {
	const jpegMIME = "image/jpeg"
	return jpegMIME
}