My goal is to generate an 8-bit serial data signal and its clock (similar to SPI bus). High-speed serialization can be achieved using SerDes (Serializer Deserializer) dedicated logic in Xilinx Spartan-6 FPGA.
This post is entirely based on the Xilinx application note [XAPP1064]. General clock configurations are also described in [UG382], in the section "High-Speed IOSERDES2 Usage for Advanced Serialization" (p.31 in v1.9).
Hard-wired features coming along with I/O pads

One I/O Bank (IOB) contains 4 pads, which are coupled (2 and 2) for differential mode.

IOBs are grouped together into I/O clocking regions.

Two clocking regions share the same "REG" block (① BUFIO2, ② BUFIO2FB and ③ BUFPLL). According to [UG382]:
To improve the performance of the I/O tile, Spartan-6 FPGAs contain a dedicated I/O clock network for the connections where performance is most critical.

Each IOB has an inner and an outer TIOI (① ILOGIC2, ② OLOGIC2 and ③ IODELAY). The OLOGIC2 (output) contains the serializer and the ILOGIC2 (input) contains the deserializer.
SerDes configuration
The target configuration is found in [XAPP1064] on p.19 (v1.2). The only difference is the use of "single-ended" (se) transmission lines, whereas the application note uses "differential" lines.

OSERDES is documented in [UG381] (p.91, v1.6).

In addition to data (D), 3-state information (T) can be given to the OSERDES2 block.
Verilog implementation
// serdes_sdr_se
//
// SDR output serializer built from two cascaded OSERDES2 primitives
// (one MASTER, one SLAVE) whose serial output drives a single-ended OBUF.
//
// Parameters:
//   DATA_LEN  - value passed to DATA_WIDTH of both OSERDES2 blocks
//               (default 8). data_in is always 8 bits wide.
//
// Ports:
//   clk_txio  - high-speed I/O clock ("serial" side), wired to CLK0
//   clk_data  - low-speed clock ("parallel" side), wired to CLKDIV
//   strobe_tx - serdes strobe, wired to IOCE
//   data_in   - parallel data; [7:4] feed the master, [3:0] feed the slave
//   reset     - wired to RST of both OSERDES2 blocks
//   data_out  - serial data output (after the OBUF)
//
// Changes with respect to the previous revision:
//   * the trailing comma in the port list is gone (it declared an extra,
//     unnamed null port);
//   * the unused CLK1 inputs are tied low instead of being left floating.
module serdes_sdr_se(
    clk_txio,
    clk_data,
    strobe_tx,
    data_in,
    reset,
    data_out
);

parameter integer DATA_LEN = 8;

input        clk_txio;   // high-speed clock ("serial" data)
input        clk_data;   // low-speed clock ("parallel" data)
input        strobe_tx;  // data strobe signal
input  [7:0] data_in;    // parallel data input
input        reset;      // reset serialization registers (async)
output       data_out;   // serial data output

wire data_out_unbuf;     // master OQ -> OBUF
wire cascade_di;         // D cascade, master SHIFTOUT1 -> slave SHIFTIN1
wire cascade_ti;         // T cascade, master SHIFTOUT2 -> slave SHIFTIN2
wire cascade_do;         // D cascade, slave SHIFTOUT3 -> master SHIFTIN3
wire cascade_to;         // T cascade, slave SHIFTOUT4 -> master SHIFTIN4

// Output buffer instance (single-ended).
OBUF io_data_out (
    .I (data_out_unbuf),
    .O (data_out)
);

// Master OSERDES2: takes the upper nibble and drives the output buffer.
// NOTE(review): OUTPUT_MODE is kept at "DIFFERENTIAL" exactly as in the
// previous revision although the pad is single-ended; confirm against the
// single-ended reference design shipped with XAPP1064 before changing it.
OSERDES2 #(
    .DATA_RATE_OQ  ("SDR"),           // SDR, DDR
    .DATA_RATE_OT  ("SDR"),           // SDR, DDR
    .DATA_WIDTH    (DATA_LEN),        // [2 ... 8]
    .OUTPUT_MODE   ("DIFFERENTIAL"),  // SINGLE_ENDED, DIFFERENTIAL
    .SERDES_MODE   ("MASTER"),        // MASTER, SLAVE
    .TRAIN_PATTERN (0)                // [0 ... 15]
)
oserdes_m (
    // clocks, strobe, reset
    .CLK0      (clk_txio),        // "serial" clock
    .CLK1      (1'b0),            // second serial clock (DDR only): tied low
    .CLKDIV    (clk_data),        // "parallel" clock
    .IOCE      (strobe_tx),
    .RST       (reset),           // (async)
    // parallel data
    .D4        (data_in[7]),
    .D3        (data_in[6]),
    .D2        (data_in[5]),
    .D1        (data_in[4]),
    .OCE       (1'b1),            // data clock enable
    // 3-state control: constant 0 on every T input
    .T1        (1'b0),
    .T2        (1'b0),
    .T3        (1'b0),
    .T4        (1'b0),
    .TCE       (1'b1),            // 3-state clock enable
    .TRAIN     (1'b0),            // training pattern enable
    // cascade inputs
    .SHIFTIN1  (1'b1),            // unused on the master, tied high
    .SHIFTIN2  (1'b1),            // unused on the master, tied high
    .SHIFTIN3  (cascade_do),      // D cascade from slave SHIFTOUT3
    .SHIFTIN4  (cascade_to),      // T cascade from slave SHIFTOUT4
    // outputs
    .OQ        (data_out_unbuf),  // serial data output
    .TQ        (),                // 3-state output (unused)
    .SHIFTOUT1 (cascade_di),      // D cascade to slave SHIFTIN1
    .SHIFTOUT2 (cascade_ti),      // T cascade to slave SHIFTIN2
    .SHIFTOUT3 (),                // unused on the master
    .SHIFTOUT4 ()                 // unused on the master
);

// Slave OSERDES2: takes the lower nibble; its OQ is not used, the data
// reaches the master through the SHIFTOUT3/SHIFTOUT4 cascade.
OSERDES2 #(
    .DATA_RATE_OQ  ("SDR"),           // SDR, DDR
    .DATA_RATE_OT  ("SDR"),           // SDR, DDR
    .DATA_WIDTH    (DATA_LEN),        // [2 ... 8]
    .OUTPUT_MODE   ("DIFFERENTIAL"),  // SINGLE_ENDED, DIFFERENTIAL
    .SERDES_MODE   ("SLAVE"),         // MASTER, SLAVE
    .TRAIN_PATTERN (0)                // [0 ... 15]
)
oserdes_s (
    // clocks, strobe, reset
    .CLK0      (clk_txio),        // "serial" clock
    .CLK1      (1'b0),            // second serial clock (DDR only): tied low
    .CLKDIV    (clk_data),        // "parallel" clock
    .IOCE      (strobe_tx),
    .RST       (reset),           // (async)
    // parallel data
    .D4        (data_in[3]),
    .D3        (data_in[2]),
    .D2        (data_in[1]),
    .D1        (data_in[0]),
    .OCE       (1'b1),            // data clock enable
    // 3-state control: constant 0 on every T input
    .T1        (1'b0),
    .T2        (1'b0),
    .T3        (1'b0),
    .T4        (1'b0),
    .TCE       (1'b1),            // 3-state clock enable
    .TRAIN     (1'b0),            // training pattern enable
    // cascade inputs
    .SHIFTIN1  (cascade_di),      // D cascade from master SHIFTOUT1
    .SHIFTIN2  (cascade_ti),      // T cascade from master SHIFTOUT2
    .SHIFTIN3  (1'b1),            // unused on the slave, tied high
    .SHIFTIN4  (1'b1),            // unused on the slave, tied high
    // outputs
    .OQ        (),                // serial data output (unused on the slave)
    .TQ        (),                // 3-state output (unused)
    .SHIFTOUT1 (),                // unused on the slave
    .SHIFTOUT2 (),                // unused on the slave
    .SHIFTOUT3 (cascade_do),      // D cascade to master SHIFTIN3
    .SHIFTOUT4 (cascade_to)       // T cascade to master SHIFTIN4
);

endmodule
// clockgen_sdr_se
//
// Clock generation for the SDR serializer. One clock input pad (IBUFG)
// feeds two BUFIO2 primitives:
//   * bufio2_data  - produces the I/O clock, its serdes strobe and a
//                    divided clock (DIVIDE = DATA_LEN) that is put on a
//                    global buffer (BUFG);
//   * bufio2_clock - same input with USE_DOUBLER enabled, used for clock
//                    forwarding.
//
// Parameters:
//   DATA_LEN      - BUFIO2 DIVIDE value (default 8)
//
// Ports:
//   clk_in        - clock input pad
//   clk_out       - IOCLK of bufio2_data
//   strobe_out    - SERDESSTROBE of bufio2_data
//   clk_x2_out    - IOCLK of bufio2_clock (doubler enabled)
//   strobe_x2_out - SERDESSTROBE of bufio2_clock
//   clk_div_out   - DIVCLK of bufio2_data after the BUFG ("parallel" clock)
//
// Change with respect to the previous revision: the trailing comma in the
// port list is gone (it declared an extra, unnamed null port).
module clockgen_sdr_se(
    clk_in,
    clk_out,
    strobe_out,
    clk_x2_out,
    strobe_x2_out,
    clk_div_out
);

parameter integer DATA_LEN = 8;

input  clk_in;         // high-speed clock ("serial" data)
output clk_out;        // same as clk_in (BUFIO2 used)
output strobe_out;     // strobe output of BUFIO2 for clk_out
output clk_x2_out;     // clk_in double speed (BUFIO2 used)
output strobe_x2_out;  // strobe output of BUFIO2 for clk_x2_out
output clk_div_out;    // clk_in divided by serialization width (="parallel" clock)

wire clockgen_in;      // IBUFG -> both BUFIO2
wire gclk_in;          // BUFIO2 DIVCLK -> BUFG

// Clock input buffer.
IBUFG clk_iob_in (
    .I (clk_in),
    .O (clockgen_in)
);

// "parallel" data clock (div) & "serial" data clock (x1)
BUFIO2 #(
    .DIVIDE        (DATA_LEN),
    .DIVIDE_BYPASS ("FALSE"),
    .I_INVERT      ("FALSE"),
    .USE_DOUBLER   ("FALSE")
)
bufio2_data (
    .I            (clockgen_in),
    .IOCLK        (clk_out),
    .DIVCLK       (gclk_in),
    .SERDESSTROBE (strobe_out)
);

// Put the divided clock on a global clock buffer.
BUFG bufg_tx (
    .I (gclk_in),
    .O (clk_div_out)
);

// clock forward (x2)
// NOTE(review): DIVIDE is left at DATA_LEN with the doubler enabled, as in
// the previous revision; confirm against UG382 that strobe_x2_out then has
// the intended period.
BUFIO2 #(
    .DIVIDE        (DATA_LEN),
    .DIVIDE_BYPASS ("FALSE"),
    .I_INVERT      ("FALSE"),
    .USE_DOUBLER   ("TRUE")
)
bufio2_clock (
    .I            (clockgen_in),
    .IOCLK        (clk_x2_out),
    .DIVCLK       (),              // divided clock not needed here
    .SERDESSTROBE (strobe_x2_out)
);

endmodule
Top-level component

The pad selection for data_out and clock_out is quite tricky because of the limited options for optimized local I/O clock lines.
According to the datasheet [DS162] ("Output Serializer/Deserializer Switching Characteristics" p.46 v3.1), the CLKDIV max frequency is 250MHz (for -2 speed grade). In "Clock Buffers and Networks" (p.56 v3.1), the BUFIO2 max frequency is 500MHz (speed grade -2).
The initial test was successful at 50MHz.
[UG381] | Spartan-6 FPGA SelectIO Resources http://www.xilinx.com/support/documentation/user_guides/ug381.pdf |
[UG382] | (1, 2) Spartan-6 FPGA Clocking Resources http://www.xilinx.com/support/documentation/user_guides/ug382.pdf |
[XAPP1064] | (1, 2) Source-Synchronous Serialization and Deserialization (up to 1050 Mb/s) http://www.xilinx.com/support/documentation/application_notes/xapp1064.pdf |
[1] | Source code in relation with XAPP1064 https://secure.xilinx.com/webreg/clickthrough.do?cid=140956 |
[DS162] | Spartan-6 FPGA Data Sheet: DC and Switching Characteristics http://www.xilinx.com/support/documentation/data_sheets/ds162.pdf |